The data consists of vegetation % cover by functional group from across CONUS (from AIM, FIA, LANDFIRE, and RAP), as well as climate variables from DayMet, which have been aggregated into mean interannual conditions across multiple temporal windows.

Dependencies

User defined parameters

# Echo the run parameters so the rendered report records the settings used.
print(params)
## $run
## [1] TRUE
## 
## $test_run
## [1] FALSE
## 
## $save_figs
## [1] FALSE
## 
## $ecoregion
## [1] "forest"
## 
## $response
## [1] "BareGroundCover"
# ---- Settings taken from the report parameters ----
# `test_run`: set to TRUE to run on a limited number of rows (for code testing).
run <- params[["run"]]
test_run <- params[["test_run"]]
save_figs <- params[["save_figs"]]
response <- params[["response"]]

# ---- Fixed sampling settings ----
# Fit the model to a sample of the data.
fit_sample <- TRUE
# Sample size of the training data.
n_train <- 5e4
# Sample size of the testing data (if this is too big the decile dotplot code
# throws memory errors).
n_test <- 1e6
# set option so resampled dataset created here reproduces earlier runs of this code with dplyr 1.0.10
source("../../../Functions/glmTransformsIterates.R")
source("../../../Functions/transformPreds.R")
source("../../../Functions/StepBeta_mine.R")
#source("src/fig_params.R")
#source("src/modeling_functions.R")
 
library(ggspatial)
library(terra)
library(tidyterra)
library(sf)
library(caret)
library(tidyverse)
library(GGally) # for ggpairs()
library(pdp) # for partial dependence plots
library(gridExtra)
library(knitr)
library(patchwork) # for figure insets etc. 
library(ggtext)
library(StepBeta)
theme_set(theme_classic())
library(here)
library(rsample)
library(kableExtra)
library(glmnet)
library(USA.state.boundaries)

read in data

Data compiled in the prepDataForModels.R script

# Anchor `here` to this report, then load the data set used for modelling.
here::i_am("Analysis/VegComposition/ModelFitting/02_ModelFitting.Rmd")
modDat_path <- here("Data_processed", "CoverData", "DataForModels_spatiallyAveraged_withSoils_noSf.rds")
modDat <- readRDS(modDat_path)

# Two anomaly columns contain infinite values. As a stop-gap they are
# overwritten with a fixed number (the original author's note describes this as
# "change to lowest value for now"; the constants are hard-coded, not computed).
# 5th percentile of annual wet degree days, 3-year anomaly:
inf_wetDegDays <- is.infinite(modDat$annWetDegDays_5percentile_3yrAnom)
modDat[inf_wetDegDays, "annWetDegDays_5percentile_3yrAnom"] <- -47.8
# 95th percentile of annual water deficit, 3-year anomaly:
inf_waterDeficit <- is.infinite(modDat$annWaterDeficit_95percentile_3yrAnom)
modDat[inf_waterDeficit, "annWaterDeficit_95percentile_3yrAnom"] <- -600

# # Convert total cover variables into proportions (for later use in beta regression models) ... proportions are already scaled from zero to 1
# modDat <- modDat %>%
#   mutate(TotalTreeCover = TotalTreeCover/100,
#          CAMCover = CAMCover/100,
#          TotalHerbaceousCover = TotalHerbaceousCover/100,
#          BareGroundCover = BareGroundCover/100,
#          ShrubCover = ShrubCover/100
#          )
# The Gamma model framework can't handle zeros in the response, so every exact
# zero in a cover column is replaced by 0.0001. Missing values are left alone.
zero_floor_cols <- c(
  "TotalTreeCover", "CAMCover", "TotalHerbaceousCover", "BareGroundCover",
  "ShrubCover", "BroadleavedTreeCover_prop", "NeedleLeavedTreeCover_prop",
  "C4Cover_prop", "C3Cover_prop", "ForbCover_prop"
)
for (cover_col in zero_floor_cols) {
  is_zero <- !is.na(modDat[[cover_col]]) & modDat[[cover_col]] == 0
  modDat[is_zero, cover_col] <- 0.0001
}
# 
# modDat[modDat$TotalTreeCover ==1& !is.na(modDat$TotalTreeCover), "TotalTreeCover"] <- 0.999
# modDat[modDat$CAMCover ==1& !is.na(modDat$CAMCover), "CAMCover"] <- 0.999
# modDat[modDat$TotalHerbaceousCover ==1 & !is.na(modDat$TotalHerbaceousCover), "TotalHerbaceousCover"] <- 0.999
# modDat[modDat$BareGroundCover ==1& !is.na(modDat$BareGroundCover), "BareGroundCover"] <- 0.999
# modDat[modDat$ShrubCover ==1& !is.na(modDat$ShrubCover), "ShrubCover"] <- 0.999
# modDat[modDat$BroadleavedTreeCover_prop ==1& !is.na(modDat$BroadleavedTreeCover_prop), "BroadleavedTreeCover_prop"] <- 0.999
# modDat[modDat$NeedleLeavedTreeCover_prop ==1& !is.na(modDat$NeedleLeavedTreeCover_prop), "NeedleLeavedTreeCover_prop"] <- 0.999
# modDat[modDat$C4Cover_prop ==1& !is.na(modDat$C4Cover_prop), "C4Cover_prop"] <- 0.999
# modDat[modDat$C3Cover_prop ==1& !is.na(modDat$C3Cover_prop), "C3Cover_prop"] <- 0.999
# modDat[modDat$ForbCover_prop ==1& !is.na(modDat$ForbCover_prop), "ForbCover_prop"] <- 0.999

Prep data

set.seed(1234)

# Lookup vectors of the form  short_model_name = "column name in modDat".
# The trailing "# <number>" tags are carried over unchanged from the original
# code; their meaning is not documented in this file.

# climate variables (columns ending in "_CLIM")
clim_lookup <- c(
  tmin = "tmin_meanAnnAvg_CLIM",
  tmax = "tmax_meanAnnAvg_CLIM", # 1
  tmean = "tmean_meanAnnAvg_CLIM",
  prcp = "prcp_meanAnnTotal_CLIM",
  t_warm = "T_warmestMonth_meanAnnAvg_CLIM",
  t_cold = "T_coldestMonth_meanAnnAvg_CLIM",
  prcp_wet = "precip_wettestMonth_meanAnnAvg_CLIM",
  prcp_dry = "precip_driestMonth_meanAnnAvg_CLIM",
  prcp_seasonality = "precip_Seasonality_meanAnnAvg_CLIM", # 2
  prcpTempCorr = "PrecipTempCorr_meanAnnAvg_CLIM", # 3
  abvFreezingMonth = "aboveFreezing_month_meanAnnAvg_CLIM",
  isothermality = "isothermality_meanAnnAvg_CLIM", # 4
  annWatDef = "annWaterDeficit_meanAnnAvg_CLIM",
  annWetDegDays = "annWetDegDays_meanAnnAvg_CLIM",
  VPD_mean = "annVPD_mean_meanAnnAvg_CLIM",
  VPD_max = "annVPD_max_meanAnnAvg_CLIM", # 5
  VPD_min = "annVPD_min_meanAnnAvg_CLIM", # 6
  VPD_max_95 = "annVPD_max_95percentile_CLIM",
  annWatDef_95 = "annWaterDeficit_95percentile_CLIM",
  annWetDegDays_5 = "annWetDegDays_5percentile_CLIM",
  frostFreeDays_5 = "durationFrostFreeDays_5percentile_CLIM",
  frostFreeDays = "durationFrostFreeDays_meanAnnAvg_CLIM"
)

# soils variables (`soilDepth` keeps its name)
soil_lookup <- c(
  soilDepth = "soilDepth", # 7
  clay = "surfaceClay_perc",
  sand = "avgSandPerc_acrossDepth", # 8
  coarse = "avgCoarsePerc_acrossDepth", # 9
  carbon = "avgOrganicCarbonPerc_0_3cm", # 10
  AWHC = "totalAvailableWaterHoldingCapacity"
)

# anomaly variables (columns ending in "_3yrAnom")
anom_lookup <- c(
  tmean_anom = "tmean_meanAnnAvg_3yrAnom", # 15
  tmin_anom = "tmin_meanAnnAvg_3yrAnom", # 16
  tmax_anom = "tmax_meanAnnAvg_3yrAnom", # 17
  prcp_anom = "prcp_meanAnnTotal_3yrAnom", # 18
  t_warm_anom = "T_warmestMonth_meanAnnAvg_3yrAnom", # 19
  t_cold_anom = "T_coldestMonth_meanAnnAvg_3yrAnom", # 20
  prcp_wet_anom = "precip_wettestMonth_meanAnnAvg_3yrAnom", # 21
  # (sic) "precp": this spelling is the one the predictor lists below use
  precp_dry_anom = "precip_driestMonth_meanAnnAvg_3yrAnom", # 22
  prcp_seasonality_anom = "precip_Seasonality_meanAnnAvg_3yrAnom", # 23
  prcpTempCorr_anom = "PrecipTempCorr_meanAnnAvg_3yrAnom", # 24
  aboveFreezingMonth_anom = "aboveFreezing_month_meanAnnAvg_3yrAnom", # 25
  isothermality_anom = "isothermality_meanAnnAvg_3yrAnom", # 26
  annWatDef_anom = "annWaterDeficit_meanAnnAvg_3yrAnom", # 27
  annWetDegDays_anom = "annWetDegDays_meanAnnAvg_3yrAnom", # 28
  VPD_mean_anom = "annVPD_mean_meanAnnAvg_3yrAnom", # 29
  VPD_min_anom = "annVPD_min_meanAnnAvg_3yrAnom", # 30
  VPD_max_anom = "annVPD_max_meanAnnAvg_3yrAnom", # 31
  VPD_max_95_anom = "annVPD_max_95percentile_3yrAnom", # 32
  annWatDef_95_anom = "annWaterDeficit_95percentile_3yrAnom", # 33
  annWetDegDays_5_anom = "annWetDegDays_5percentile_3yrAnom", # 34
  frostFreeDays_5_anom = "durationFrostFreeDays_5percentile_3yrAnom", # 35
  frostFreeDays_anom = "durationFrostFreeDays_meanAnnAvg_3yrAnom" # 36
)

# Drop the block of columns from prcp_annTotal through annVPD_min, then give
# the remaining climate / soils / anomaly columns their short model names.
modDat_1 <- modDat %>%
  select(-c(prcp_annTotal:annVPD_min)) %>%
  # mutate(Lon = st_coordinates(.)[,1],
  #        Lat = st_coordinates(.)[,2])  %>%
  # st_drop_geometry() %>%
  # filter(!is.na(newRegion))
  rename(all_of(c(clim_lookup, soil_lookup, anom_lookup)))

# When testing the code, work with a random subset of 1e5 rows only
if (test_run) {
  modDat_1 <- slice_sample(modDat_1, n = 1e5)
}

Add a constant to the response variable (+2) so that models run…

# Shift the response column upward by the constant 2
modDat_1[[response]] <- modDat_1[[response]] + 2

Identify the ecoregion and response variable type to use in this model run

# Pull the ecoregion and response for this run from the report parameters and
# echo them into the rendered output.
ecoregion <- params[["ecoregion"]]
response <- params[["response"]]
run_description <- paste0(
  "In this model run, the ecoregion is ", ecoregion,
  " and the response variable is ", response
)
print(run_description)
## [1] "In this model run, the ecoregion is forest and the response variable is BareGroundCover"

Subset the data to only include data for the ecoregion of interest

# Keep only the rows whose `newRegion` belongs to the ecoregion chosen for this
# run. Any other value of `ecoregion` leaves the data unfiltered.
regions_to_keep <- switch(ecoregion,
  shrubGrass = "dryShrubGrass",
  forest = c("eastForest", "westForest")
)
if (!is.null(regions_to_keep)) {
  modDat_1 <- filter(modDat_1, newRegion %in% regions_to_keep)
}

# Drop rows that have no observation for the response variable of interest
has_response <- !is.na(modDat_1[[response]])
modDat_1 <- modDat_1[has_response, ]

Currently, we subsample the data from the “Texas-Louisiana Coastal Plain” level 2 ecoregion (keeping 30% of its rows), since it is quite different from the other regions and strongly distorts the model fit

# Split the data into the Texas-Louisiana Coastal Plain rows and all other
# rows, keep a random 30% of the former, and stack the two pieces again
# (sampled coastal-plain rows first).
tx_la_l2name <- "TEXAS-LOUISIANA COASTAL PLAIN"
modDat_1_noLA <- filter(modDat_1, NA_L2NAME != tx_la_l2name)
modDat_1_LA <- filter(modDat_1, NA_L2NAME == tx_la_l2name)

n_LA_keep <- round(nrow(modDat_1_LA) * .3)
modDat_1 <- rbind(
  slice_sample(modDat_1_LA, n = n_LA_keep),
  modDat_1_noLA
)

Visualize the response variable

# Distribution of the response column as it currently stands in `modDat_1`
hist(
  modDat_1[[response]],
  main = paste0("Histogram of ", response),
  xlab = response
)

Visualize the predictor variables

The following are the candidate predictor variables for this ecoregion:

# Candidate predictor variables; the list depends on the ecoregion being
# modelled.
if (ecoregion == "shrubGrass") {
  prednames <- c(
    "tmean", "prcp", "prcp_seasonality", "prcpTempCorr",
    "isothermality", "annWatDef", "sand", "coarse",
    "carbon", "AWHC", "tmin_anom", "tmax_anom",
    "t_warm_anom", "prcp_wet_anom", "precp_dry_anom", "prcp_seasonality_anom",
    "prcpTempCorr_anom", "aboveFreezingMonth_anom", "isothermality_anom", "annWatDef_anom",
    "annWetDegDays_anom", "VPD_mean_anom", "VPD_min_anom", "frostFreeDays_5_anom"
  )
} else if (ecoregion == "forest") {
  prednames <- c(
    "tmean", "prcp", "prcp_dry", "prcpTempCorr",
    "isothermality", "annWatDef", "clay", "sand",
    "coarse", "carbon", "AWHC", "tmin_anom",
    "tmax_anom", "prcp_anom", "prcp_wet_anom", "precp_dry_anom",
    "prcp_seasonality_anom", "prcpTempCorr_anom", "aboveFreezingMonth_anom", "isothermality_anom",
    "annWatDef_anom", "annWetDegDays_anom", "VPD_mean_anom", "VPD_max_95_anom",
    "frostFreeDays_5_anom"
  )
} else {
  # Fail here with a clear message instead of continuing with `prednames`
  # undefined (or left over from an earlier run in the same session).
  stop(
    "No candidate predictors are defined for ecoregion '", ecoregion,
    "' (expected 'shrubGrass' or 'forest').",
    call. = FALSE
  )
}

# Subset the data to these predictors, the response, and the identifying
# columns, then remove any rows that still contain an NA.
# `all_of()` makes explicit that `prednames` and `response` are character
# vectors holding column names, so they cannot be confused with columns that
# happen to be called "prednames" or "response".
modDat_1 <- modDat_1 %>%
  select(all_of(c(prednames, response)), newRegion, Year, Long, Lat, NA_L1NAME, NA_L2NAME) %>%
  drop_na()

# Name the vector by its own values (later code uses `names(prednames)`)
names(prednames) <- prednames
df_pred <- modDat_1[, prednames]
# 
# # print the list of predictor variables
# knitr::kable(format = "html", data.frame("Possible_Predictors" = prednames)
# ) %>%
#   kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
#' Summarise every column of a data frame
#'
#' Reshapes `df` to long format (one row per original column and observation)
#' and computes the mean, minimum, median and maximum of each original column,
#' ignoring missing values. Numeric results are rounded to 4 decimal places.
#'
#' @param df A data frame; every column is pivoted into a single `value`
#'   column, so all columns must be of a combinable type.
#' @return A data frame with one row per column of `df` and the columns
#'   `variable`, `value_mean`, `value_min`, `value_median` and `value_max`.
create_summary <- function(df) {
  df %>%
    pivot_longer(cols = everything(),
                 names_to = "variable") %>%
    group_by(variable) %>%
    summarise(across(value, .fns = list(
      mean = ~mean(.x, na.rm = TRUE),
      min = ~min(.x, na.rm = TRUE),
      median = ~median(.x, na.rm = TRUE),
      max = ~max(.x, na.rm = TRUE)
    ))) %>%
    # The rounding is written as a lambda because passing extra arguments
    # through across()'s `...` (as in `across(cols, round, 4)`) is deprecated.
    mutate(across(where(is.numeric), ~round(.x, 4)))
}

# Table of mean / min / median / max for each candidate predictor
knitr::kable(
  create_summary(modDat_1[prednames]),
  caption = 'summaries of possible predictor variables'
) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
summaries of possible predictor variables
variable value_mean value_min value_median value_max
AWHC 14.4824 0.9085 13.6143 33.7514
VPD_max_95_anom 0.0681 -0.5726 0.0480 0.8326
VPD_mean_anom -0.0220 -0.3148 -0.0225 0.2651
aboveFreezingMonth_anom 0.1037 -1.7879 0.0667 3.3667
annWatDef 33.3952 0.0000 23.3300 303.5579
annWatDef_anom -0.0333 -8.9984 -0.0586 1.0000
annWetDegDays_anom -0.0096 -1.2398 -0.0237 0.8050
carbon 7.4635 0.2171 4.0615 51.0604
clay 15.2024 0.1664 15.6334 76.9703
coarse 15.5303 0.0000 13.3834 64.1122
frostFreeDays_5_anom -25.5106 -273.1000 -30.0000 53.1000
isothermality 35.9490 21.4503 36.1517 59.8804
isothermality_anom 0.7299 -8.4018 0.7269 11.7898
prcp 1068.7554 210.3373 1093.5862 4069.6417
prcpTempCorr -0.1341 -0.8521 -0.1118 0.7258
prcpTempCorr_anom -0.0067 -0.5978 0.0022 0.6098
prcp_anom -0.0035 -0.8885 -0.0050 0.6720
prcp_dry 17.3386 0.0003 13.1030 68.1677
prcp_seasonality_anom 0.0037 -0.6092 0.0072 0.4788
prcp_wet_anom 0.0004 -1.4179 0.0113 0.6981
precp_dry_anom 0.0430 -9.0000 0.0565 1.0000
sand 46.7606 0.8357 46.2652 98.8121
tmax_anom -0.2699 -5.6017 -0.2791 4.0197
tmean 9.2689 -2.4480 7.9021 24.9713
tmin_anom -0.6385 -5.7692 -0.5876 2.6920
# response_summary <- modDat_1 %>% 
#     dplyr::select(#where(is.numeric), -all_of(pred_vars),
#       matches(response)) %>% 
#     create_summary()
# 
# 
# kable(response_summary, 
#       caption = 'summaries of response variables, calculated using paint') %>%
# kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 

Plot predictor vars against each other

set.seed(12011993)

#' Correlation panel with a colour-coded background, for use in `ggpairs()`
#'
#' Computes the correlation between the two variables of a panel, turns it into
#' a background colour, and draws the `ggally_cor()` correlation label on top.
#'
#' @param data Data frame holding the panel's variables.
#' @param mapping Aesthetic mapping whose `x` and `y` entries identify the two
#'   variables.
#' @param method,use Passed to `cor()` for the correlation that determines the
#'   background colour only; the printed label is produced by `ggally_cor()`.
#' @param ... Further arguments forwarded to `ggally_cor()`.
#' @return The `ggally_cor()` plot with a void theme and a filled panel
#'   background.
my_fn <- function(data, mapping, method="p", use="pairwise", ...){

  # grab data
  x <- eval_data_col(data, mapping$x)
  y <- eval_data_col(data, mapping$y)

  # calculate correlation
  corr <- cor(x, y, method=method, use=use)

  # calculate colour based on correlation value:
  # 100-step ramp in which a correlation of minus one is red, zero is white,
  # and plus one is blue.
  # Change this to suit: possibly extend to add as an argument of `my_fn`
  colFn <- colorRampPalette(c("red", "white", "blue"), interpolate ='spline')
  fill <- colFn(100)[findInterval(corr, seq(-1, 1, length.out = 100))]

  ggally_cor(data = data, mapping = mapping, size = 2.5, stars = FALSE,
             digits = 2, colour = I("black"),...) +
    theme_void() +
    theme(panel.background = element_rect(fill=fill))

}

# Pairs plot of the candidate predictors, drawn from a random sample of at most
# 5e4 rows: scatterplots below the diagonal, colour-coded correlations
# (`my_fn`) above it. The plot is printed and saved when `run` is TRUE.
if (run == TRUE) {
  (corrPlot <- modDat_1 %>%
    # all_of(): `prednames` is a character vector of column names, not a column
    select(all_of(prednames)) %>%
    slice_sample(n = 5e4) %>%
    #select(-matches("_")) %>%
    ggpairs(upper = list(continuous = my_fn, size = .1),
            lower = list(continuous = GGally::wrap("points", alpha = 0.1, size=0.1)),
            progress = FALSE))
  base::saveRDS(corrPlot, paste0("../ModelFitting/models/", response, "_",ecoregion, "_corrPlot.rds"))

} else {
  # corrPlot <- readRDS(paste0("../ModelFitting/models/", response, "_",ecoregion, "_corrPlot.rds"))
  # (corrPlot)
  print(c("See previous correlation figures"))
}

Predictor variables compared to binned response variables

set.seed(12011993)
# vector of name of response variables
vars_response <- response

# Long-format data frame for the boxplots: one row per sampled observation and
# predictor, with the response binned into four classes.
df_sample_plots <- modDat_1 %>%
  slice_sample(n = 5e4) %>%
  rename(response = all_of(response)) %>%
  # The response column had the constant 2 added to it earlier in this script.
  # Remove it again before binning; otherwise every value exceeds all of the
  # thresholds below and lands in the last bin.
  mutate(response = response - 2) %>%
  # NOTE(review): the thresholds look like they assume a 0-1 scale for the
  # response -- confirm against the units of the cover data.
  mutate(response = case_when(
    response <= .25 ~ ".25",
    response > .25 & response <= .5 ~ ".5",
    response > .5 & response <= .75 ~ ".75",
    response >= .75 ~ "1"
  )) %>%
  # "response" is now a column name; `prednames` is a character vector of names
  select(all_of(c("response", unname(prednames)))) %>%
  tidyr::pivot_longer(cols = all_of(unname(prednames)),
                      names_to = "predictor",
                      values_to = "value")

# Boxplots of each predictor within each response bin
# (aes() replaces the deprecated aes_string())
ggplot(df_sample_plots, aes(x = response, y = value)) +
  geom_boxplot() +
  facet_wrap(~predictor, scales = 'free_y') +
  ylab("Predictor Variable Values") +
  xlab(response)

Standardize the predictor variables for the model-fitting process

# Add a standardised copy of every predictor, named "<predictor>_s".
# across() builds those names itself through `.names`, so no separate
# `names<-` step is needed afterwards.
modDat_1_s <- modDat_1 %>%
  mutate(across(all_of(prednames), base::scale, .names = "{.col}_s"))

prednames_scaled <- paste0(prednames, "_s")

# Long-format versions of the unscaled and the scaled predictors ...
scaleFigDat_1 <- modDat_1_s %>%
  select(all_of(c("Long", "Lat", "Year", unname(prednames)))) %>%
  pivot_longer(cols = all_of(unname(prednames)),
               names_to = "predNames",
               values_to = "predValues_unScaled")
scaleFigDat_2 <- modDat_1_s %>%
  select(all_of(c("Long", "Lat", "Year", prednames_scaled))) %>%
  pivot_longer(cols = all_of(prednames_scaled),
               names_to = "predNames",
               values_to = "predValues_scaled") %>%
  # strip the "_s" suffix so both tables share the same predictor labels
  mutate(predNames = str_replace(predNames, pattern = "_s$", replacement = ""))

# ... joined on location, year and predictor. The keys are spelled out so the
# join does not depend on whichever columns the two tables happen to share.
scaleFigDat_3 <- scaleFigDat_1 %>%
  left_join(scaleFigDat_2, by = c("Long", "Lat", "Year", "predNames"))

ggplot(scaleFigDat_3) +
  facet_wrap(~predNames, scales = "free") +
  geom_histogram(aes(predValues_unScaled), fill = "lightgrey", col = "darkgrey") +
  geom_histogram(aes(predValues_scaled), fill = "lightblue", col = "blue") +
  xlab ("predictor variable values") +
  ggtitle("Comparing the distribution of unscaled (grey) to scaled (blue) predictor variables")

Model Fitting

Visualize the level 2 ecoregions and how they differ across environmental space

## visualize the variation between groups across environmental space

## do a pca of climate across level 2 ecoregions
library(factoextra)

# Principal components of the standardised predictors; observations are
# coloured by level 2 ecoregion, with a 95% ellipse drawn for each group.
scaled_pred_cols <- paste0(prednames, "_s")
pca <- prcomp(modDat_1_s[, scaled_pred_cols])
(fviz_pca_ind(
  pca,
  habillage = modDat_1_s$NA_L2NAME,
  label = "none",
  addEllipses = TRUE,
  ellipse.level = .95,
  ggtheme = theme_minimal(),
  alpha.ind = .1
))

# For the shrub/grass run, several level 2 ecoregions are merged into larger
# groups. The relabelling is applied to both the scaled and the unscaled data.
if (ecoregion == "shrubGrass") {
  print("We'll combine the 'Mediterranean California' and 'Western Sierra Madre Piedmont' ecoregions (into 'Mediterranean Piedmont'). We'll also combine `Tamaulipas-Texas semiarid plain,' 'Texas-Lousiana Coastal plain,' and 'South Central semiarid prairies' ecoregions (into (`Semiarid plain and prairies`)." )

  # merged name -> the level 2 ecoregions it absorbs
  l2_merge_groups <- list(
    "MEDITERRANEAN PIEDMONT" = c("MEDITERRANEAN CALIFORNIA", "WESTERN SIERRA MADRE PIEDMONT"),
    "SEMIARID PLAIN AND PRAIRIES" = c("TAMAULIPAS-TEXAS SEMIARID PLAIN", "TEXAS-LOUISIANA COASTAL PLAIN", "SOUTH CENTRAL SEMIARID PRAIRIES")
  )
  for (merged_l2name in names(l2_merge_groups)) {
    absorbed_l2names <- l2_merge_groups[[merged_l2name]]
    modDat_1_s[modDat_1_s$NA_L2NAME %in% absorbed_l2names, "NA_L2NAME"] <- merged_l2name
    modDat_1[modDat_1$NA_L2NAME %in% absorbed_l2names, "NA_L2NAME"] <- merged_l2name
  }
}

# Table of the number of observations in each level 2 ecoregion
modDat_1 %>%
  group_by(NA_L2NAME) %>%
  dplyr::summarize(Number_Of_Observations = n()) %>%
  select(Level_2_Ecoregion = NA_L2NAME, Number_Of_Observations) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Level_2_Ecoregion Number_Of_Observations
ATLANTIC HIGHLANDS 4656
CENTRAL USA PLAINS 1056
EVERGLADES 212
MARINE WEST COAST FOREST 3353
MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS 6116
MIXED WOOD PLAINS 6600
MIXED WOOD SHIELD 6611
OZARK/OUACHITA-APPALACHIAN FORESTS 13291
SOUTHEASTERN USA PLAINS 20850
UPPER GILA MOUNTAINS 6933
WESTERN CORDILLERA 57533

Then, look at the spatial distribution and environmental characteristics of the grouped ecoregions

## Turn the data into an sf point layer built from the Long/Lat columns.
## The coordinate reference system is given as a WKT string (a Lambert Conic
## Conformal (2SP) projection with metre units, per the string itself).
modDat_crs_wkt <- "PROJCRS[\"unnamed\",\n    BASEGEOGCRS[\"unknown\",\n        DATUM[\"unknown\",\n            ELLIPSOID[\"Spheroid\",6378137,298.257223563,\n                LENGTHUNIT[\"metre\",1,\n                    ID[\"EPSG\",9001]]]],\n        PRIMEM[\"Greenwich\",0,\n            ANGLEUNIT[\"degree\",0.0174532925199433,\n                ID[\"EPSG\",9122]]]],\n    CONVERSION[\"Lambert Conic Conformal (2SP)\",\n        METHOD[\"Lambert Conic Conformal (2SP)\",\n            ID[\"EPSG\",9802]],\n        PARAMETER[\"Latitude of false origin\",42.5,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8821]],\n        PARAMETER[\"Longitude of false origin\",-100,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8822]],\n        PARAMETER[\"Latitude of 1st standard parallel\",25,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8823]],\n        PARAMETER[\"Latitude of 2nd standard parallel\",60,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8824]],\n        PARAMETER[\"Easting at false origin\",0,\n            LENGTHUNIT[\"metre\",1],\n            ID[\"EPSG\",8826]],\n        PARAMETER[\"Northing at false origin\",0,\n            LENGTHUNIT[\"metre\",1],\n            ID[\"EPSG\",8827]]],\n    CS[Cartesian,2],\n        AXIS[\"easting\",east,\n            ORDER[1],\n            LENGTHUNIT[\"metre\",1,\n                ID[\"EPSG\",9001]]],\n        AXIS[\"northing\",north,\n            ORDER[2],\n            LENGTHUNIT[\"metre\",1,\n                ID[\"EPSG\",9001]]]]"
modDat_1_sf <- st_as_sf(modDat_1, coords = c("Long", "Lat"), crs = st_crs(modDat_crs_wkt))


# Load the state-boundary polygons used as the map background
# (presumably provided by the USA.state.boundaries package attached above).
data(state_boundaries_wgs84)

# Entries that are dropped so that only the remaining states are drawn
states_to_drop <- c(
  "Hawaii", "Alaska", "Puerto Rico", "American Samoa", "Guam",
  "Commonwealth of the Northern Mariana Islands",
  "United States Virgin Islands"
)

# Remove those entries (and any row without a NAME), then re-project the
# polygons to the CRS of the point data.
cropped_states <- suppressMessages(
  state_boundaries_wgs84 %>%
    dplyr::filter(!is.na(NAME), !NAME %in% states_to_drop) %>%
    sf::st_sf() %>%
    sf::st_transform(sf::st_crs(modDat_1_sf))
) #%>%
  #sf::st_crop(sf::st_bbox(modDat_1_sf)+c(-1,-1,1,1))

# Map of the observations, coloured by level 2 ecoregion, drawn over the state
# outlines. The legend is suppressed.
map1 <- ggplot() +
  geom_sf(data=cropped_states,fill='white') +
  geom_sf(data=modDat_1_sf,aes(fill=as.factor(NA_L2NAME)),linewidth=0.5,alpha=0.5) +
  # `alpha` used to be passed twice here (0.5 and .1), which ggplot2 reports as
  # a duplicated aesthetic. Only one value is kept; 0.5 was listed first and
  # matches the geom_sf() layer above.
  # NOTE(review): switch to .1 if the fainter points were the intended look.
  geom_point(data=modDat_1,
             aes(x = Long, y = Lat, color=as.factor(NA_L2NAME)), alpha = 0.5) +
  #scale_fill_okabeito() +
  #scale_color_okabeito() +
 # theme_default() +
  theme(legend.position = 'none') +
  labs(title = "Level 2 Ecoregions as spatial blocks")

# Rows that form the convex hull of each level 2 ecoregion in
# (tmean, prcp) space
hull <- modDat_1_sf %>%
  ungroup() %>%
  group_by(NA_L2NAME) %>%
  slice(chull(tmean, prcp))

# Climate space: observations and their per-ecoregion hulls
plot1 <- ggplot(data = modDat_1_sf, mapping = aes(x = tmean, y = prcp)) +
  geom_polygon(data = hull, mapping = aes(fill = NA_L2NAME), alpha = 0.25) +
  geom_point(mapping = aes(group = NA_L2NAME, color = NA_L2NAME), alpha = 0.25) +
  theme_minimal() +
  labs(
    x = "Annual Average T_mean - long-term average",
    y = "Annual Average Precip - long-term average"
  ) #+
  #scale_color_okabeito() +
  #scale_fill_okabeito()

# Density of each variable in the column range tmean:prcp, by ecoregion
climate_long_sf <- pivot_longer(modDat_1_sf, cols = tmean:prcp)
plot2 <- ggplot(data = climate_long_sf, mapping = aes(x = value, group = name)) +
  # geom_polygon(data = hull, alpha = 0.25,aes(fill=fold) )+
  geom_density(mapping = aes(group = NA_L2NAME, fill = NA_L2NAME), alpha = 0.25) +
  theme_minimal() +
  facet_wrap(~name, scales = 'free') # +
  #scale_color_okabeito() +
  #scale_fill_okabeito()

# Arrange the panels: map and climate space on top, densities below
library(patchwork)
(combo <- (map1 + plot1) / plot2)

Fit a global model with all of the data

First, fit a LASSO regression model using the glmnet R package

  • This regression is a Gamma glm with a log link
  • Use cross validation across level 2 ecoregions to tune the lambda parameter in the LASSO model
  • this model is fit using the scaled weather/climate/soils variables
  • this list of possible predictors includes:
    1. main effects
    2. interactions between all soils variables
    3. interactions between climate and weather variables
    4. transformed main effects (squared, log-transformed (add a uniform integer – 20 – to all variables prior to log-transformation), square root-transformed (add a uniform integer – 20 – to all variables prior to square-root transformation))
## 
## Call:  cv.glmnet(x = X[, 2:ncol(X)], y = y, type.measure = "mse", foldid = my_folds,      keep = TRUE, parallel = TRUE, family = stats::Gamma(link = "log"),      alpha = 1, nlambda = 100, standardize = FALSE) 
## 
## Measure: Mean-Squared Error 
## 
##     Lambda Index Measure    SE Nonzero
## min 0.0077    49   53.29 15.48      74
## 1se 0.5096     4   68.68 17.25       2

Then, fit regular glm models (Gamma glm with a log link), first using the coefficients from the ‘best’ lambda identified in the LASSO model, and then using the coefficients from the ‘1SE’ lambda identified from the LASSO (this is the value of lambda such that the cross validation error is within 1 standard error of the minimum).

## Refit with glm(), using only the terms whose LASSO coefficient is non-zero.
## Done twice: for the 'best' lambda and for the '1se' lambda.

## ---- 'best' lambda ----
mat_glmnet_best <- as.matrix(bestLambda_coef)
nonzero_best <- mat_glmnet_best[, 1] != 0
mat2_glmnet_best <- mat_glmnet_best[nonzero_best, ]
names(mat2_glmnet_best) <- rownames(mat_glmnet_best)[nonzero_best]

# A single surviving coefficient is treated as the intercept alone; otherwise
# the first entry is skipped and the rest become the right-hand side.
f_glm_bestLambda <- if (length(mat2_glmnet_best) == 1) {
  as.formula(paste0(response, "~ 1"))
} else {
  as.formula(paste0(response, " ~ ", paste0(names(mat2_glmnet_best)[-1], collapse = " + ")))
}

fit_glm_bestLambda <- glm(
  formula = f_glm_bestLambda,
  family = stats::Gamma(link = "log"),
  data = modDat_1_s
)

## ---- '1se' lambda ----
mat_glmnet_1se <- as.matrix(seLambda_coef)
nonzero_1se <- mat_glmnet_1se[, 1] != 0
mat2_glmnet_1se <- mat_glmnet_1se[nonzero_1se, ]
names(mat2_glmnet_1se) <- rownames(mat_glmnet_1se)[nonzero_1se]

f_glm_1se <- if (length(mat2_glmnet_1se) == 1) {
  as.formula(paste0(response, "~ 1"))
} else {
  as.formula(paste0(response, " ~ ", paste0(names(mat2_glmnet_1se)[-1], collapse = " + ")))
}

fit_glm_se <- glm(
  formula = f_glm_1se,
  family = stats::Gamma(link = "log"),
  data = modDat_1_s
)

Then, we predict (on the training set) using both of these models (best lambda and 1se lambda)

  ## In-sample predictions from the two refitted glms, shifted back by the
  ## constant 2 that was added to the response before fitting.
  ## These calls used to pass `newx = X[, 2:ncol(X)]`. `newx` is an argument of
  ## glmnet's predict method; predict() on a glm takes `newdata` and silently
  ## ignores `newx`. The result was therefore always the fitted values for the
  ## data each glm was fit to (`modDat_1_s`), which is what is requested
  ## explicitly now.
  ## NOTE(review): `y` and `X` are assumed to be row-aligned with `modDat_1_s`
  ## -- confirm where they are built.
  optimal_pred <- predict(fit_glm_bestLambda, type = "response") - 2
  optimal_pred_1se <- predict(fit_glm_se, type = "response") - 2

  ## intercept-only model as a baseline
  null_fit <- glm(#data = data.frame("y" = y, X[,paste0(prednames, "_s")]),
    formula = y ~ 1, family = stats::Gamma(link = "log"))
  null_pred <- predict(null_fit, newdata = as.data.frame(X), type = "response"
                       ) - 2

  ## collect the model objects and the observed / predicted values
  ## (`obs` is stored as `y`, i.e. without the shift by 2)
  fullModOut <- list(
    "modelObject" = fit,
    "nullModelObject" = null_fit,
    "modelPredictions" = data.frame(#ecoRegion_holdout = rep(test_eco,length(y)),
      obs=y,
                    pred_opt=optimal_pred,
                    pred_opt_se = optimal_pred_1se,
                    pred_null=null_pred#,
                    #pred_nopenalty=nopen_pred
                    ))
  
  
# Correlation of each set of predictions (best lambda, 1se lambda, null model)
# with the observations; 2 is subtracted from `y` to match the predictions.
my_cors <- c(
  cor(optimal_pred, c(y - 2)),
  cor(optimal_pred_1se, c(y - 2)),
  cor(null_pred, c(y - 2))
)

# Mean squared error of the same three sets of predictions
my_mse <- c(
  mean((fullModOut$modelPredictions$pred_opt - c(y - 2))^2),
  mean((fullModOut$modelPredictions$pred_opt_se - c(y - 2))^2),
  mean((fullModOut$modelPredictions$pred_null - c(y - 2))^2) #,
  #mean((obs_pred$pred_nopenalty - obs_pred$obs)^2)
)

# Observed values and the three sets of predictions, all plotted against the
# second column of X
ggplot() +
  geom_point(aes(x = X[,2], y = fullModOut$modelPredictions$obs-2),
             col = "black", alpha = .1) +
  ## predictions w/ the CV model
  geom_point(aes(x = X[,2], y = fullModOut$modelPredictions$pred_opt),
             col = "red", alpha = .1) +
  ## predictions w/ the CV model (1se lambda)
  geom_point(aes(x = X[,2], y = fullModOut$modelPredictions$pred_opt_se),
             col = "green", alpha = .1) +
  geom_point(aes(x = X[,2], y = fullModOut$modelPredictions$pred_null),
             col = "blue", alpha = .1) +
  labs(
    title = "A rough comparison of observed and model-predicted values",
    subtitle = "black = observed values \n red = predictions from 'best lambda' model \n green = predictions from '1se' lambda model \n blue = predictions from null model",
    x = colnames(X)[2]
  )

  #ylim(c(0,200))

The internal cross-validation process to fit the global LASSO model identified an optimal lambda value (regularization parameter) of `r best_lambda`. The lambda value such that the cross validation error is within 1 standard error of the minimum (“1se lambda”) was `r fit$lambda.1se`. The following coefficients were kept in each model:

# Coefficients of the glm refit with the 'best' lambda terms, one row per term
coef_glm_bestLambda <- coef(fit_glm_bestLambda) %>%
  data.frame()
coef_glm_bestLambda$coefficientName <- rownames(coef_glm_bestLambda)
names(coef_glm_bestLambda)[1] <- "coefficientValue_bestLambda"
# Coefficients of the glm refit with the '1se' lambda terms
coef_glm_1se <- coef(fit_glm_se) %>%
  data.frame()
coef_glm_1se$coefficientName <- rownames(coef_glm_1se)
names(coef_glm_1se)[1] <- "coefficientValue_1seLambda"
# Combine both sets; a term missing from one model gets NA in that column.
# The join key is named explicitly instead of being guessed from the columns
# the two tables share.
coefs <- full_join(coef_glm_bestLambda, coef_glm_1se, by = "coefficientName") %>%
  select(coefficientName, coefficientValue_bestLambda, coefficientValue_1seLambda)

globModTerms <- coefs[!is.na(coefs$coefficientValue_bestLambda), "coefficientName"]

## also, get the number of unique variables in each model
var_prop_pred <- paste0(response, "_pred")
response_vars <- c(response, var_prop_pred)

# Reduce model term labels to the unique variable names they contain:
# interactions are split at ":", and the "I(" / "^2)" wrapper of squared terms
# is removed.
# NOTE(review): `parts > 0` compares strings with "0". It drops the empty
# strings that pad the split matrix; whether "(Intercept)" is dropped as well
# depends on the collation order of the session -- confirm what is intended
# before relying on the counts below.
unique_term_variables <- function(model_terms) {
  parts <- paste(str_split(model_terms, ":", simplify = TRUE))
  parts <- str_replace(parts, "I\\(", "")
  parts <- str_replace(parts, "\\^2\\)", "")
  unique(parts[parts > 0])
}

# for best lambda model
prednames_fig <- unique_term_variables(globModTerms)
prednames_fig_num <- length(prednames_fig)
# for 1SE lambda model; a model with a single term is counted as having
# 0 variables
globModTerms_1se <- coefs[!is.na(coefs$coefficientValue_1seLambda), "coefficientName"]
prednames_fig_1se <- unique_term_variables(globModTerms_1se)
prednames_fig_1se_num <- if (length(globModTerms_1se) == 1) {
  0
} else {
  length(prednames_fig_1se)
}

# make a table
kable(coefs, col.names = c("Coefficient Name", "Value from best lambda model", "Value from 1se lambda model")
      ) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Coefficient Name Value from best lambda model Value from 1se lambda model
(Intercept) 1.2487655 1.5840053
tmean_s -0.2662733 NA
prcp_s -0.2420847 NA
prcp_dry_s -0.1582773 NA
prcpTempCorr_s -0.0701652 NA
isothermality_s 0.0459890 NA
annWatDef_s 0.2073439 NA
clay_s 0.0578030 NA
coarse_s -0.1169261 NA
carbon_s -0.1220350 NA
AWHC_s -0.0935196 NA
prcp_anom_s 0.0051661 NA
precp_dry_anom_s 0.0058505 NA
prcp_seasonality_anom_s -0.0105121 NA
isothermality_anom_s -0.0495494 NA
annWetDegDays_anom_s 0.0036101 NA
frostFreeDays_5_anom_s -0.0107574 NA
I(tmean_s^2) 0.0913696 NA
I(prcp_s^2) 0.0275567 NA
I(prcp_dry_s^2) 0.0484797 NA
I(isothermality_s^2) -0.0216311 NA
I(prcp_wet_anom_s^2) -0.0038047 NA
I(precp_dry_anom_s^2) 0.0000785 0.0143714
I(aboveFreezingMonth_anom_s^2) 0.0027418 NA
I(annWatDef_anom_s^2) -0.0000483 NA
I(annWetDegDays_anom_s^2) 0.0082486 0.1217870
I(VPD_mean_anom_s^2) 0.0035583 NA
I(VPD_max_95_anom_s^2) -0.0059864 NA
I(frostFreeDays_5_anom_s^2) 0.0082701 NA
I(clay_s^2) -0.0125955 NA
I(sand_s^2) 0.0172446 NA
I(carbon_s^2) 0.0235691 NA
I(AWHC_s^2) -0.0100516 NA
frostFreeDays_5_anom_s:aboveFreezingMonth_anom_s 0.0076972 NA
isothermality_anom_s:aboveFreezingMonth_anom_s -0.0088724 NA
aboveFreezingMonth_anom_s:tmin_anom_s 0.0105731 NA
prcp_s:annWatDef_s -0.0935673 NA
prcpTempCorr_s:annWatDef_s 0.0491413 NA
tmean_s:annWatDef_s -0.1549043 NA
prcp_s:annWetDegDays_anom_s -0.0114714 NA
precp_dry_anom_s:annWetDegDays_anom_s -0.0054579 NA
annWetDegDays_anom_s:tmax_anom_s -0.0082487 NA
isothermality_s:frostFreeDays_5_anom_s 0.0513218 NA
prcp_s:frostFreeDays_5_anom_s 0.0315142 NA
prcpTempCorr_s:frostFreeDays_5_anom_s 0.0346309 NA
frostFreeDays_5_anom_s:tmin_anom_s 0.0140302 NA
frostFreeDays_5_anom_s:VPD_max_95_anom_s 0.0212634 NA
isothermality_s:isothermality_anom_s -0.0226157 NA
prcp_anom_s:isothermality_anom_s -0.0108624 NA
prcp_dry_s:isothermality_anom_s 0.0123221 NA
prcp_seasonality_anom_s:isothermality_anom_s 0.0025569 NA
prcpTempCorr_s:isothermality_anom_s 0.0241089 NA
precp_dry_anom_s:isothermality_anom_s -0.0064780 NA
isothermality_anom_s:tmax_anom_s 0.0080627 NA
prcp_dry_s:isothermality_s -0.0386451 NA
prcp_s:isothermality_s -0.0104304 NA
isothermality_s:tmax_anom_s 0.0166415 NA
isothermality_s:tmin_anom_s -0.0047678 NA
prcp_s:prcp_anom_s -0.0106963 NA
prcp_anom_s:tmin_anom_s -0.0105354 NA
prcp_dry_s:prcpTempCorr_s 0.0829879 NA
prcp_s:prcp_seasonality_anom_s 0.0179414 NA
prcp_s:prcpTempCorr_s -0.1928649 NA
prcp_s:tmax_anom_s -0.0063580 NA
prcp_s:VPD_max_95_anom_s 0.0296975 NA
prcpTempCorr_s:prcp_seasonality_anom_s -0.0026241 NA
tmean_s:prcp_seasonality_anom_s -0.0158362 NA
tmean_s:prcpTempCorr_anom_s -0.0094565 NA
prcpTempCorr_s:tmax_anom_s -0.0148230 NA
tmean_s:prcpTempCorr_s 0.0903759 NA
tmean_s:VPD_max_95_anom_s -0.0311635 NA
clay_s:AWHC_s 0.0094927 NA
coarse_s:AWHC_s -0.0621186 NA
coarse_s:carbon_s -0.0302285 NA
carbon_s:sand_s -0.0325578 NA
# Fit statistics (RMSE, bias) and model-size summaries for both models
model_preds <- fullModOut$modelPredictions

# RMSE: observed and predicted columns are both shifted by -2 before scoring
RMSE_best <- yardstick::rmse(
  model_preds[, c("obs", "pred_opt")] - 2,
  truth = "obs",
  estimate = "pred_opt"
)$.estimate
RMSE_1se <- yardstick::rmse(
  model_preds[, c("obs", "pred_opt_se")] - 2,
  truth = "obs",
  estimate = "pred_opt_se"
)$.estimate

# bias: mean of (observed - predicted)
bias_best <- mean(model_preds$obs - model_preds$pred_opt)
bias_1se <- mean(model_preds$obs - model_preds$pred_opt_se)

# number of the given predictor names that fall in the climate, weather and
# soils predictor sets (in that order)
count_by_group <- function(pred_names) {
  vapply(
    list(prednames_clim, prednames_weath, prednames_soils),
    function(group) sum(pred_names %in% group),
    integer(1)
  )
}

best_lambda_column <- c(
  RMSE_best,
  bias_best,
  as.integer(length(globModTerms) - 1), # term count minus one, as in the 1se column
  as.integer(prednames_fig_num),
  count_by_group(prednames_fig)
)
se_lambda_column <- c(
  RMSE_1se,
  bias_1se,
  length(globModTerms_1se) - 1,
  prednames_fig_1se_num,
  count_by_group(prednames_fig_1se)
)

uniqueCoeffs <- data.frame(
  "Best lambda model" = best_lambda_column,
  "1se lambda model" = se_lambda_column
)
row.names(uniqueCoeffs) <- c(
  "RMSE",
  "bias - mean(obs-pred.)",
  "Total number of coefficients",
  "Number of unique coefficients",
  "Number of unique climate coefficients",
  "Number of unique weather coefficients",
  "Number of unique soils coefficients"
)

uniqueCoeffs %>%
  kable(
    col.names = c("Best lambda model", "1se lambda model"),
    row.names = TRUE
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Best lambda model 1se lambda model
RMSE 6.85393 450.4051794
bias - mean(obs-pred.) 2.01758 -0.7741009
Total number of coefficients 74.00000 2.0000000
Number of unique coefficients 25.00000 2.0000000
Number of unique climate coefficients 6.00000 0.0000000
Number of unique weather coefficients 14.00000 2.0000000
Number of unique soils coefficients 5.00000 0.0000000

Visualizations of Model Predictions and Residuals – using best lambda model

observed vs. predicted values

Predicting on the data

# create predictions from a fitted model and append them to the predictor data
# (the prediction column is named "<response>_pred")
#' Append model predictions to a table of predictors
#'
#' @param mod Fitted model object; handed to `predict()`.
#' @param df Predictor data; handed to `predict()` as `newx` and returned with
#'   the predictions bound on as extra column(s).
#' @return `df` with the `predict(..., type = "response")` output column-bound
#'   to it; the last column is renamed to `paste0(response, "_pred")`.
#'   `response` is not an argument: it is read from the enclosing (global)
#'   environment.
predict_by_response <- function(mod, df) {
  pred_colname <- paste0(response, "_pred")
  # (a specific `s = "lambda.min"` is deliberately not passed to predict())
  with_preds <- cbind(df, predict(mod, newx = df, type = "response"))
  colnames(with_preds)[ncol(with_preds)] <- pred_colname
  with_preds
}

# Predictions from the best-lambda model. The first column of X is excluded
# from the predictor data (NOTE(review): presumably the intercept column of
# the model matrix -- confirm).
pred_glm1 <- predict_by_response(fit_glm_bestLambda, X[,2:ncol(X)])

# add back in true y values
pred_glm1 <- pred_glm1 %>%
  cbind( data.frame("y" = y))
# rename the observed-response column from 'y' to the name of the response
colnames(pred_glm1)[which(colnames(pred_glm1) == "y")] <- paste(response)

# add back in lat/long data
pred_glm1 <- pred_glm1 %>%
  cbind(modDat_1_s[,c("Long", "Lat", "Year")])

# residual = observed - predicted
pred_glm1$resid <- pred_glm1[,response] - pred_glm1[,paste0(response, "_pred")]

# flag extreme residuals (|resid| > 70) with 1; all other rows stay NA.
# which() drops NA comparisons: a logical row index containing NA makes
# `[<-.data.frame` stop with "missing values are not allowed in subscripted
# assignments", so the previous logical-index form failed on any NA residual.
pred_glm1$extremeResid <- NA
pred_glm1[which(abs(pred_glm1$resid) > 70), "extremeResid"] <- 1

# plot(x = pred_glm1[,response],
#      y = pred_glm1[,paste0(response, "_pred")],
#      xlab = "observed values", ylab = "predicted values")
# points(x = pred_glm1[!is.na(pred_glm1$extremeResid), response],
#        y = pred_glm1[!is.na(pred_glm1$extremeResid), paste0(response, "_pred")],
#        col = "red")
# Predictions from the 1se-lambda model. The first column of X is excluded
# from the predictor data (NOTE(review): presumably the intercept column of
# the model matrix -- confirm).
pred_glm1_1se <- predict_by_response(fit_glm_se, X[,2:ncol(X)])

# add back in true y values
pred_glm1_1se <- pred_glm1_1se %>%
  cbind( data.frame("y" = y))
# rename the observed-response column from 'y' to the name of the response
colnames(pred_glm1_1se)[which(colnames(pred_glm1_1se) == "y")] <- paste(response)

# add back in lat/long data
pred_glm1_1se <- pred_glm1_1se %>%
  cbind(modDat_1_s[,c("Long", "Lat", "Year")])

# residual = observed - predicted
pred_glm1_1se$resid <- pred_glm1_1se[,response] - pred_glm1_1se[,paste0(response, "_pred")]

# flag extreme residuals (|resid| > 70) with 1; all other rows stay NA.
# which() drops NA comparisons: a logical row index containing NA makes
# `[<-.data.frame` stop with "missing values are not allowed in subscripted
# assignments", so the previous logical-index form failed on any NA residual.
pred_glm1_1se$extremeResid <- NA
pred_glm1_1se[which(abs(pred_glm1_1se$resid) > 70), "extremeResid"] <- 1

Maps of Observations, Predictions, and Residuals

Observations across the temporal range of the dataset

# residuals of the best-lambda model: observed minus predicted
pred_glm1$resid <- pred_glm1[[response]] - pred_glm1[[paste0(response, "_pred")]]

# rasterize
# reference raster: one Daymet monthly precipitation layer, aggregated by a
# factor of 8 (cell mean); used below as the template grid and CRS
daymet_ref_path <- "../../../Data_raw/dayMet/rawMonthlyData/orders/70e0da02b9d2d6e8faa8c97d211f3546/Daymet_Monthly_V4R1/data/daymet_v4_prcp_monttl_na_1980.tif"
test_rast <- terra::aggregate(rast(daymet_ref_path), fact = 8, fun = "mean")
## |---------|---------|---------|---------|=========================================                                          
## add ecoregion boundaries (for our ecoregion level model)
# read the ecoregion polygons (layer NA_CEC_Eco_Level2)
regions <- sf::st_read(
  dsn = "../../../Data_raw/Level2Ecoregions/",
  layer = "NA_CEC_Eco_Level2"
)
## Reading layer `NA_CEC_Eco_Level2' from data source 
##   `/Users/astears/Documents/Dropbox_static/Work/NAU_USGS_postdoc/PED_vegClimModels/Data_raw/Level2Ecoregions' using driver `ESRI Shapefile'
## Simple feature collection with 2261 features and 8 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -4334052 ymin: -3313739 xmax: 3324076 ymax: 4267265
## Projected CRS: Sphere_ARC_INFO_Lambert_Azimuthal_Equal_Area
# reproject the ecoregion polygons to the CRS of the reference raster, then
# repair any invalid geometries
regions <- st_make_valid(
  st_transform(regions, crs = st_crs(test_rast))
)
# (cropping to the raster extent is currently disabled:
#  st_crop(st_bbox(test_rast)))
# 
# goodRegions_temp <- st_overlaps(y = cropped_states, x = regions, sparse = FALSE) %>% 
#   rowSums() 
# goodRegions <- regions[goodRegions_temp != 0,]

# Lookup table from Level 1 ecoregion name to the coarse model region.
# The labels are matched to the names purely by position in the
# alphabetically sorted unique names, so the two vectors must have the same
# length; otherwise data.frame() could silently recycle the labels.
l1_region_names <- sort(unique(regions$NA_L1NAME))
l1_region_labels <- c(NA, "Forest", "dryShrubGrass",
                      "dryShrubGrass", "Forest", "dryShrubGrass",
                      "dryShrubGrass", "Forest", "Forest",
                      "dryShrubGrass", "Forest", "Forest",
                      "Forest", "Forest", "dryShrubGrass",
                      NA
                      )
stopifnot(length(l1_region_names) == length(l1_region_labels))
ecoregionLU <- data.frame("NA_L1NAME" = l1_region_names,
                          "newRegion" = l1_region_labels)

# attach the coarse region label to every polygon; the join key is given
# explicitly so it cannot change if another shared column name appears
goodRegions <- regions %>%
  left_join(ecoregionLU, by = "NA_L1NAME")

# one (simplified) outline per coarse region, for drawing on the maps;
# polygons without a region label are dropped
mapRegions <- goodRegions %>%
  filter(!is.na(newRegion)) %>%
  group_by(newRegion) %>%
  summarise(geometry = sf::st_union(geometry)) %>%
  ungroup() %>%
  st_simplify(dTolerance = 1000)
#mapview(mapRegions)
# rasterize data
plotObs <- pred_glm1 %>% 
         drop_na(paste(response)) %>% 
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) %>% 
  terra::rasterize(y = test_rast, 
                   field = response, 
                   fun = mean) #%>% 
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>% 
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotObs, na.rm = TRUE)

plotObs_2 <- plotObs %>% 
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2])) 
       )
# make figures
ggplot() +
geom_spatraster(data = plotObs_2) + 
  geom_sf(data = mapRegions, fill = NA, col = "rosybrown4", lwd = .5) +
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) +
labs(title = paste0("Observations of ", response, " in the ",ecoregion, " ecoregion")) +
  scale_fill_gradient2(low = "brown",
                       mid = "wheat" ,
                       high = "darkgreen" , 
                       midpoint = 0,   na.value = "lightgrey") + 
  xlim(st_bbox(plotObs_2)[c(1,3)]) + 
  ylim(st_bbox(plotObs_2)[c(2,4)])

Predictions across the temporal range of the dataset

# Rasterize the best-lambda predictions onto the reference grid (cell mean)
pred_colname <- paste0(response, "_pred")
plotPred <- pred_glm1 %>%
  drop_na(all_of(pred_colname)) %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast)) %>%
  terra::rasterize(y = test_rast, field = pred_colname, fun = mean)

# point locations of predictions that fall outside 0-100 (> 100 or < 0)
highPred_points <- pred_glm1 %>%
  filter(.data[[pred_colname]] > 100 | .data[[pred_colname]] < 0) %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast))

# crop the raster to the bounding box of its non-NA cells
tempExt <- crds(plotPred, na.rm = TRUE)
pred_x_range <- range(tempExt[, 1])
pred_y_range <- range(tempExt[, 2])
plotPred_2 <- crop(
  plotPred,
  ext(pred_x_range[1], pred_x_range[2], pred_y_range[1], pred_y_range[2])
)

# make figure; state outlines and axis limits reuse the bounding box of the
# observation raster (plotObs_2)
obs_bbox <- st_bbox(plotObs_2)
states_in_view <- st_crop(
  st_transform(cropped_states, crs = st_crs(test_rast)),
  st_bbox(plotObs_2)
)

ggplot() +
  geom_spatraster(data = plotPred_2) +
  geom_sf(data = mapRegions, fill = NA, col = "rosybrown4", lwd = .5) +
  geom_sf(data = states_in_view, fill = NA) +
  geom_sf(data = highPred_points, col = "red") +
  labs(
    title = paste0("Predictions from the 'best lambda' fitted model of ", response, " in the ",ecoregion, " ecoregion"),
    subtitle = "bestLambda model"
  ) +
  scale_fill_gradient2(
    low = "wheat",
    mid = "darkgreen",
    high = "red",
    midpoint = 100,
    na.value = "lightgrey",
    limits = c(0, 100)
  ) +
  xlim(obs_bbox[c(1, 3)]) +
  ylim(obs_bbox[c(2, 4)])

# Rasterize the 1se-lambda predictions onto the reference grid (cell mean)
pred_colname <- paste0(response, "_pred")
plotPred <- pred_glm1_1se %>%
  drop_na(all_of(pred_colname)) %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast)) %>%
  terra::rasterize(y = test_rast, field = pred_colname, fun = mean)

# point locations of predictions that fall outside 0-100 (> 100 or < 0)
highPred_points <- pred_glm1_1se %>%
  filter(.data[[pred_colname]] > 100 | .data[[pred_colname]] < 0) %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast))

# crop the raster to the bounding box of its non-NA cells
tempExt <- crds(plotPred, na.rm = TRUE)
pred_x_range <- range(tempExt[, 1])
pred_y_range <- range(tempExt[, 2])
plotPred_2 <- crop(
  plotPred,
  ext(pred_x_range[1], pred_x_range[2], pred_y_range[1], pred_y_range[2])
)

# make figure; state outlines and axis limits reuse the bounding box of the
# observation raster (plotObs_2)
obs_bbox <- st_bbox(plotObs_2)
states_in_view <- st_crop(
  st_transform(cropped_states, crs = st_crs(test_rast)),
  st_bbox(plotObs_2)
)

ggplot() +
  geom_spatraster(data = plotPred_2) +
  geom_sf(data = mapRegions, fill = NA, col = "rosybrown4", lwd = .5) +
  geom_sf(data = states_in_view, fill = NA) +
  geom_sf(data = highPred_points, col = "red") +
  labs(
    title = paste0("Predictions from the '1SE lambda' fitted model of ", response, " in the ",ecoregion, " ecoregion"),
    subtitle = "1 SE Lambda model"
  ) +
  scale_fill_gradient2(
    low = "wheat",
    mid = "darkgreen",
    high = "red",
    midpoint = 100,
    na.value = "lightgrey",
    limits = c(0, 100)
  ) +
  xlim(obs_bbox[c(1, 3)]) +
  ylim(obs_bbox[c(2, 4)])

Residuals across the entire temporal extent of the dataset

# rasterize the best-lambda residuals onto the reference grid (cell mean)
plotResid_rast <- pred_glm1 %>%
         drop_na(resid) %>%
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast)) %>%
  terra::rasterize(y = test_rast,
                   field = "resid",
                   fun = mean) #%>%
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotResid_rast, na.rm = TRUE)

plotResid_rast_2 <- plotResid_rast %>%
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2]))
       )

# identify locations where residuals are >100 or < -100
badResids_high <- pred_glm1 %>%
  filter(resid > 100)  %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast))
badResids_low <- pred_glm1 %>%
  filter(resid < -100)  %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast))

# make figures
# The title names the ecoregion actually being modelled (`ecoregion`) instead
# of a hard-coded "Grass/shrub", which mislabelled runs for other ecoregions.
map <- ggplot() +
geom_spatraster(data =plotResid_rast_2) +
  geom_sf(data = mapRegions, fill = NA, col = "rosybrown4", lwd = .5) +
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA )  +
  geom_sf(data = badResids_high, col = "blue") +
  geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Resids. (obs. - pred.) from ", ecoregion, " ecoregion-wide model of ", response),
     subtitle = "bestLambda model \n red points indicate locations that have residuals below -100 \n blue points indicate locatiosn that have residuals above 100") +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" ,
                       midpoint = 0,   na.value = "lightgrey",
                       limits = c(-100,100)
                       ) +
  xlim(st_bbox(plotObs_2)[c(1,3)]) +
  ylim(st_bbox(plotObs_2)[c(2,4)])

# Histogram of residuals with the min/max written on it. The two labels are
# added with annotate() so each is drawn once; geom_text(aes(...)) with
# constant values draws one copy of the label per row of the data.
resid_min <- min(pred_glm1$resid, na.rm = TRUE)
resid_max <- max(pred_glm1$resid, na.rm = TRUE)
hist <- ggplot(pred_glm1) +
  geom_histogram(aes(resid), fill = "lightgrey", col = "darkgrey") +
  annotate("text", x = resid_min * .9, y = 1500,
           label = paste0("min = ", round(resid_min, 2))) +
  annotate("text", x = resid_max * .9, y = 1500,
           label = paste0("max = ", round(resid_max, 2)))

# stack the map above the histogram
library(ggpubr)
ggarrange(map, hist, heights = c(3,1), ncol = 1)

# rasterize the 1se-lambda residuals onto the reference grid (cell mean)
plotResid_rast <- pred_glm1_1se %>%
         drop_na(resid) %>%
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast)) %>%
  terra::rasterize(y = test_rast,
                   field = "resid",
                   fun = mean) #%>%
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotResid_rast, na.rm = TRUE)

plotResid_rast_2 <- plotResid_rast %>%
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2]))
       )

# identify locations where residuals are >100 or < -100
badResids_high <- pred_glm1_1se %>%
  filter(resid > 100)  %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast))
badResids_low <- pred_glm1_1se %>%
  filter(resid < -100)  %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast))

# make figures
# The title names the ecoregion actually being modelled (`ecoregion`) instead
# of a hard-coded "Grass/shrub", which mislabelled runs for other ecoregions.
map <- ggplot() +
geom_spatraster(data =plotResid_rast_2) +
  geom_sf(data = mapRegions, fill = NA, col = "rosybrown4", lwd = .5) +
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA )  +
  geom_sf(data = badResids_high, col = "blue") +
  geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Resids. (obs. - pred.) from ", ecoregion, " ecoregion-wide model of ", response),
     subtitle = "1 SE Lambda model \n red points indicate locations that have residuals below -100 \n blue points indicate locatiosn that have residuals above 100") +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" ,
                       midpoint = 0,   na.value = "lightgrey",
                       limits = c(-100,100)
                       ) +
  xlim(st_bbox(plotObs_2)[c(1,3)]) +
  ylim(st_bbox(plotObs_2)[c(2,4)])

# Histogram of residuals with the min/max written on it. The two labels are
# added with annotate() so each is drawn once; geom_text(aes(...)) with
# constant values draws one copy of the label per row of the data.
resid_min <- min(pred_glm1_1se$resid, na.rm = TRUE)
resid_max <- max(pred_glm1_1se$resid, na.rm = TRUE)
hist <- ggplot(pred_glm1_1se) +
  geom_histogram(aes(resid), fill = "lightgrey", col = "darkgrey") +
  annotate("text", x = resid_min * .9, y = 1500,
           label = paste0("min = ", round(resid_min, 2))) +
  annotate("text", x = resid_max * .9, y = 1500,
           label = paste0("max = ", round(resid_max, 2)))

# stack the map above the histogram
ggarrange(map, hist, heights = c(3,1), ncol = 1)

### Are there biases of the model predictions across year/lat/long?

# Residual-bias panels: residuals against Year, Lat and Long for each model.
#
# plot_resid_trend() draws one panel: semi-transparent points of `resid`
# against column `xvar` of `dat`, with a geom_smooth() line on top.
# `jitter_x = TRUE` jitters the x position of the points only (the smoother
# always uses the raw values).
plot_resid_trend <- function(dat, xvar, x_label, y_label, title,
                             jitter_x = FALSE) {
  point_layer <- if (jitter_x) {
    geom_point(aes(x = jitter(.data[[xvar]]), y = resid), alpha = .1)
  } else {
    geom_point(aes(x = .data[[xvar]], y = resid), alpha = .1)
  }
  ggplot(dat) +
    point_layer +
    geom_smooth(aes(x = .data[[xvar]], y = resid)) +
    xlab(x_label) +
    ylab(y_label) +
    ggtitle(title)
}

# axis label and title text used for each model's panels
best_y_label <- "Residual from best lambda model"
best_title <- "from best lamba model"
se_y_label <- "Residual from 1 SE lambda model"
se_title <- "from 1 SE lamba model"

# plot residuals against Year
yearResidMod_bestLambda <- plot_resid_trend(
  pred_glm1, "Year", "Year", best_y_label, best_title, jitter_x = TRUE
)
yearResidMod_1seLambda <- plot_resid_trend(
  pred_glm1_1se, "Year", "Year", se_y_label, se_title, jitter_x = TRUE
)

# plot residuals against Lat
latResidMod_bestLambda <- plot_resid_trend(
  pred_glm1, "Lat", "Latitude", best_y_label, best_title
)
latResidMod_1seLambda <- plot_resid_trend(
  pred_glm1_1se, "Lat", "Latitude", se_y_label, se_title
)

# plot residuals against Long
longResidMod_bestLambda <- plot_resid_trend(
  pred_glm1, "Long", "Longitude", best_y_label, best_title
)
longResidMod_1seLambda <- plot_resid_trend(
  pred_glm1_1se, "Long", "Longitude", se_y_label, se_title
)

# arrange as a 3 x 2 grid: one row per covariate, one column per model
library(patchwork)
(yearResidMod_bestLambda + yearResidMod_1seLambda) /
  (latResidMod_bestLambda + latResidMod_1seLambda) /
  (longResidMod_bestLambda + longResidMod_1seLambda)

Quantile plots

Binning predictor variables into “Deciles” (actually percentiles) and looking at the mean predicted value for each percentile. The use of the word “Deciles” is just a legacy thing (they started out being actual deciles).

# Bin the predictions by quantiles of each predictor (cut points every 0.005),
# separately for each model. A model with no predictor names gets a message
# instead of a table.

# best lambda model
if (length(prednames_fig) > 0) {
  pred_glm1_deciles <- predvars2deciles(
    pred_glm1,
    response_vars = response_vars,
    pred_vars = prednames_fig,
    cut_points = seq(0, 1, 0.005)
  )
} else {
  print("The best lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
}

# 1 SE lambda model
if (length(prednames_fig_1se) > 0) {
  pred_glm1_deciles_1se <- predvars2deciles(
    pred_glm1_1se,
    response_vars = response_vars,
    pred_vars = prednames_fig_1se,
    cut_points = seq(0, 1, 0.005)
  )
} else {
  print("The 1SE lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
}

Here is a quick version of LOESS curves fit to raw data (to double-check the quantile plot calculations)

# Quick check of the quantile plots: for every predictor in the best-lambda
# model, plot the response columns against the raw predictor values and
# overlay smoothers. Skipped (with a message) when there are no predictors.
if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1 %>%
  select(all_of(c(prednames_fig, response_vars))) %>%
  # all_of() marks prednames_fig as an external character vector; passing it
  # bare is deprecated in tidyselect and ambiguous if a column shares its name
  pivot_longer(cols = all_of(prednames_fig))  %>%
  ggplot() +
  facet_wrap(~name, scales = "free") +
  # response_vars[2] is labelled as the model-predicted column below
  # (NOTE(review): its definition is outside this section -- confirm)
  geom_point(aes(x = value, y =  .data[[paste(response)]]), col = "darkblue", alpha = .1)  + # observed values
  geom_point(aes(x = value, y = .data[[response_vars[2]]]), col = "lightblue", alpha = .1) + # model-predicted values
  geom_smooth(aes(x = value, y =  .data[[paste(response)]]), col = "black", se = FALSE) +
  geom_smooth(aes(x = value, y = .data[[response_vars[2]]]), col = "brown", se = FALSE)

}

Below are the actual quantile plots (note that the predictor variables are scaled)

# Quantile ("decile") plot for the best-lambda model, optionally saved as a
# PNG. Skipped (with a message) when the model has no predictors.
if (length(prednames_fig) > 0) {
  # publication quality version
  g3 <- decile_dotplot_pq(pred_glm1_deciles, response = response, IQR = TRUE) +
    ggtitle("Decile Plot")
  g4 <- add_dotplot_inset(
    g3,
    df = pred_glm1_deciles,
    dfRaw = pred_glm1,
    add_smooth = TRUE,
    deciles = FALSE
  )

  if (save_figs) {
    quantile_plot_file <- paste0("figures/quantile_plots/quantile_plot_", response,  "_",ecoregion,".png")
    png(quantile_plot_file, units = "in", res = 600, width = 5.5, height = 3.5)
    print(g4)
    dev.off()
  }

  # show the figure in the report
  g4
} else {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
}

# Quantile ("decile") plot for the 1 SE lambda model, optionally saved as a
# PNG. Skipped (with a message) when the model has no predictors.
if (length(prednames_fig_1se) == 0) {
  print("The 1 se lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")

  } else {

# publication quality version
g3 <- decile_dotplot_pq(pred_glm1_deciles_1se, response = response, IQR = TRUE) + ggtitle("Decile Plot")

g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles_1se, dfRaw = pred_glm1_1se, add_smooth = TRUE, deciles = FALSE)


if(save_figs) {
  # "_1se" suffix: without it this file has exactly the same path as the
  # best-lambda quantile plot, so one figure overwrote the other
  png(paste0("figures/quantile_plots/quantile_plot_", response,  "_",ecoregion,"_1se.png"),
     units = "in", res = 600, width = 5.5, height = 3.5 )
    print(g4)
  dev.off()
}

# show the figure in the report
g4
}

Deciles Filtered

20th and 80th percentiles for each climate variable

# 20th and 80th percentile of every predictor used in the best-lambda model.
# drop = FALSE keeps a data frame even when prednames_fig holds a single
# name; without it the selection collapses to a bare vector and map() would
# iterate over individual values instead of over columns.
df <- pred_glm1[, prednames_fig, drop = FALSE] #%>%
  #mutate(MAT = MAT - 273.15) # k to c
quantiles <- map(df, quantile, probs = c(0.2, 0.8), na.rm = TRUE)

Filtered ‘Decile’ plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower two deciles of each predictor variable.

# Filtered quantile plots for the best-lambda model: the binning is repeated
# with filtering switched on, using every model predictor as a filter
# variable. Skipped (with a message) when the model has no predictors.
if (length(prednames_fig) > 0) {
  pred_glm1_deciles_filt <- predvars2deciles(
    pred_glm1,
    response_vars = response_vars,
    pred_vars = prednames_fig,
    filter_var = TRUE,
    filter_vars = prednames_fig,
    cut_points = seq(0, 1, 0.005)
  )

  # alternative call without xvars: decile_dotplot_filtered_pq(pred_glm1_deciles_filt)
  decile_dotplot_filtered_pq(pred_glm1_deciles_filt, xvars = prednames_fig)
} else {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
}
## Processed 16037 groups out of 238227. 7% done. Time elapsed: 3s. ETA: 41s.Processed 21602 groups out of 238227. 9% done. Time elapsed: 4s. ETA: 40s.Processed 27029 groups out of 238227. 11% done. Time elapsed: 5s. ETA: 39s.Processed 32301 groups out of 238227. 14% done. Time elapsed: 6s. ETA: 38s.Processed 36956 groups out of 238227. 16% done. Time elapsed: 7s. ETA: 38s.Processed 42166 groups out of 238227. 18% done. Time elapsed: 8s. ETA: 37s.Processed 47375 groups out of 238227. 20% done. Time elapsed: 9s. ETA: 36s.Processed 52806 groups out of 238227. 22% done. Time elapsed: 10s. ETA: 35s.Processed 58328 groups out of 238227. 24% done. Time elapsed: 11s. ETA: 33s.Processed 63871 groups out of 238227. 27% done. Time elapsed: 12s. ETA: 32s.Processed 69415 groups out of 238227. 29% done. Time elapsed: 13s. ETA: 31s.Processed 74922 groups out of 238227. 31% done. Time elapsed: 14s. ETA: 30s.Processed 80344 groups out of 238227. 34% done. Time elapsed: 15s. ETA: 29s.Processed 85747 groups out of 238227. 36% done. Time elapsed: 16s. ETA: 28s.Processed 91198 groups out of 238227. 38% done. Time elapsed: 17s. ETA: 27s.Processed 96577 groups out of 238227. 41% done. Time elapsed: 18s. ETA: 26s.Processed 101643 groups out of 238227. 43% done. Time elapsed: 19s. ETA: 25s.Processed 107123 groups out of 238227. 45% done. Time elapsed: 20s. ETA: 24s.Processed 112607 groups out of 238227. 47% done. Time elapsed: 21s. ETA: 23s.Processed 118021 groups out of 238227. 50% done. Time elapsed: 22s. ETA: 22s.Processed 123477 groups out of 238227. 52% done. Time elapsed: 23s. ETA: 21s.Processed 128966 groups out of 238227. 54% done. Time elapsed: 24s. ETA: 20s.Processed 134410 groups out of 238227. 56% done. Time elapsed: 25s. ETA: 19s.Processed 139936 groups out of 238227. 59% done. Time elapsed: 26s. ETA: 18s.Processed 145323 groups out of 238227. 61% done. Time elapsed: 27s. ETA: 17s.Processed 150815 groups out of 238227. 63% done. Time elapsed: 28s. 
ETA: 16s.Processed 156372 groups out of 238227. 66% done. Time elapsed: 29s. ETA: 15s.Processed 161913 groups out of 238227. 68% done. Time elapsed: 30s. ETA: 14s.Processed 167349 groups out of 238227. 70% done. Time elapsed: 31s. ETA: 13s.Processed 172873 groups out of 238227. 73% done. Time elapsed: 32s. ETA: 12s.Processed 178396 groups out of 238227. 75% done. Time elapsed: 33s. ETA: 11s.Processed 183932 groups out of 238227. 77% done. Time elapsed: 34s. ETA: 10s.Processed 189221 groups out of 238227. 79% done. Time elapsed: 35s. ETA: 9s.Processed 194479 groups out of 238227. 82% done. Time elapsed: 36s. ETA: 8s.Processed 199733 groups out of 238227. 84% done. Time elapsed: 37s. ETA: 7s.Processed 205275 groups out of 238227. 86% done. Time elapsed: 38s. ETA: 6s.Processed 210733 groups out of 238227. 88% done. Time elapsed: 39s. ETA: 5s.Processed 216263 groups out of 238227. 91% done. Time elapsed: 40s. ETA: 4s.Processed 221808 groups out of 238227. 93% done. Time elapsed: 41s. ETA: 3s.Processed 227318 groups out of 238227. 95% done. Time elapsed: 42s. ETA: 2s.Processed 232860 groups out of 238227. 98% done. Time elapsed: 43s. ETA: 0s.Processed 238227 groups out of 238227. 100% done. Time elapsed: 44s. ETA: 0s.

Filtered quantile figure with middle 2 deciles also shown

# Filtered decile figure that also shows the middle deciles (add_mid = TRUE).
# Skipped when the model kept no predictors.
if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1_deciles_filt_mid <- predvars2deciles(pred_glm1,
                           response_vars = response_vars,
                           pred_vars = prednames_fig,
                           filter_vars = prednames_fig,
                           filter_var = TRUE,
                           add_mid = TRUE,
                           cut_points = seq(0, 1, 0.005))

  g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt_mid, xvars = prednames_fig)

  if (save_figs) {
    # file name follows the response/ecoregion convention of the other
    # quantile plots in this report
    jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1_", response, "_", ecoregion, ".jpeg"),
         units = "in", res = 600, width = 5.5, height = 6)
    # a ggplot has to be printed explicitly to be drawn on an open device
    # from inside a braced block
    print(g)
    dev.off()
  }

  # must be the last value of the branch, otherwise the figure is not shown
  g
}
## Processed 7922 groups out of 357540. 2% done. Time elapsed: 3s. ETA: 132s.Processed 13585 groups out of 357540. 4% done. Time elapsed: 4s. ETA: 101s.Processed 19276 groups out of 357540. 5% done. Time elapsed: 5s. ETA: 87s.Processed 24318 groups out of 357540. 7% done. Time elapsed: 6s. ETA: 82s.Processed 29883 groups out of 357540. 8% done. Time elapsed: 7s. ETA: 77s.Processed 35494 groups out of 357540. 10% done. Time elapsed: 8s. ETA: 72s.Processed 41117 groups out of 357540. 11% done. Time elapsed: 9s. ETA: 69s.Processed 46656 groups out of 357540. 13% done. Time elapsed: 10s. ETA: 66s.Processed 51623 groups out of 357540. 14% done. Time elapsed: 11s. ETA: 65s.Processed 56901 groups out of 357540. 16% done. Time elapsed: 12s. ETA: 63s.Processed 62159 groups out of 357540. 17% done. Time elapsed: 13s. ETA: 62s.Processed 67438 groups out of 357540. 19% done. Time elapsed: 14s. ETA: 60s.Processed 72866 groups out of 357540. 20% done. Time elapsed: 15s. ETA: 58s.Processed 78465 groups out of 357540. 22% done. Time elapsed: 16s. ETA: 57s.Processed 83902 groups out of 357540. 23% done. Time elapsed: 17s. ETA: 55s.Processed 89470 groups out of 357540. 25% done. Time elapsed: 18s. ETA: 54s.Processed 95041 groups out of 357540. 27% done. Time elapsed: 19s. ETA: 52s.Processed 100663 groups out of 357540. 28% done. Time elapsed: 20s. ETA: 51s.Processed 106239 groups out of 357540. 30% done. Time elapsed: 21s. ETA: 50s.Processed 111871 groups out of 357540. 31% done. Time elapsed: 22s. ETA: 48s.Processed 117462 groups out of 357540. 33% done. Time elapsed: 23s. ETA: 47s.Processed 123031 groups out of 357540. 34% done. Time elapsed: 24s. ETA: 46s.Processed 128466 groups out of 357540. 36% done. Time elapsed: 25s. ETA: 44s.Processed 133496 groups out of 357540. 37% done. Time elapsed: 26s. ETA: 43s.Processed 139088 groups out of 357540. 39% done. Time elapsed: 27s. ETA: 42s.Processed 144555 groups out of 357540. 40% done. Time elapsed: 28s. 
ETA: 41s.Processed 149519 groups out of 357540. 42% done. Time elapsed: 29s. ETA: 40s.Processed 155085 groups out of 357540. 43% done. Time elapsed: 30s. ETA: 39s.Processed 160678 groups out of 357540. 45% done. Time elapsed: 31s. ETA: 38s.Processed 165750 groups out of 357540. 46% done. Time elapsed: 32s. ETA: 37s.Processed 171206 groups out of 357540. 48% done. Time elapsed: 33s. ETA: 36s.Processed 175683 groups out of 357540. 49% done. Time elapsed: 34s. ETA: 36s.Processed 181359 groups out of 357540. 51% done. Time elapsed: 35s. ETA: 34s.Processed 187037 groups out of 357540. 52% done. Time elapsed: 36s. ETA: 33s.Processed 192693 groups out of 357540. 54% done. Time elapsed: 37s. ETA: 32s.Processed 198043 groups out of 357540. 55% done. Time elapsed: 38s. ETA: 31s.Processed 203694 groups out of 357540. 57% done. Time elapsed: 39s. ETA: 30s.Processed 209342 groups out of 357540. 59% done. Time elapsed: 40s. ETA: 28s.Processed 215004 groups out of 357540. 60% done. Time elapsed: 41s. ETA: 27s.Processed 220398 groups out of 357540. 62% done. Time elapsed: 42s. ETA: 26s.Processed 226085 groups out of 357540. 63% done. Time elapsed: 43s. ETA: 25s.Processed 231521 groups out of 357540. 65% done. Time elapsed: 44s. ETA: 24s.Processed 237059 groups out of 357540. 66% done. Time elapsed: 45s. ETA: 23s.Processed 242621 groups out of 357540. 68% done. Time elapsed: 46s. ETA: 22s.Processed 247720 groups out of 357540. 69% done. Time elapsed: 47s. ETA: 21s.Processed 253347 groups out of 357540. 71% done. Time elapsed: 48s. ETA: 20s.Processed 258979 groups out of 357540. 72% done. Time elapsed: 49s. ETA: 19s.Processed 264598 groups out of 357540. 74% done. Time elapsed: 50s. ETA: 17s.Processed 270074 groups out of 357540. 76% done. Time elapsed: 52s. ETA: 16s.Processed 275666 groups out of 357540. 77% done. Time elapsed: 53s. ETA: 15s.Processed 281182 groups out of 357540. 79% done. Time elapsed: 54s. ETA: 14s.Processed 286667 groups out of 357540. 80% done. 
Time elapsed: 55s. ETA: 13s.Processed 291959 groups out of 357540. 82% done. Time elapsed: 56s. ETA: 12s.Processed 297207 groups out of 357540. 83% done. Time elapsed: 57s. ETA: 11s.Processed 302350 groups out of 357540. 85% done. Time elapsed: 58s. ETA: 10s.Processed 307962 groups out of 357540. 86% done. Time elapsed: 59s. ETA: 9s.Processed 313529 groups out of 357540. 88% done. Time elapsed: 60s. ETA: 8s.Processed 319121 groups out of 357540. 89% done. Time elapsed: 61s. ETA: 7s.Processed 324681 groups out of 357540. 91% done. Time elapsed: 62s. ETA: 6s.Processed 330304 groups out of 357540. 92% done. Time elapsed: 63s. ETA: 5s.Processed 335735 groups out of 357540. 94% done. Time elapsed: 64s. ETA: 4s.Processed 341372 groups out of 357540. 95% done. Time elapsed: 65s. ETA: 3s.Processed 347023 groups out of 357540. 97% done. Time elapsed: 66s. ETA: 2s.Processed 352053 groups out of 357540. 98% done. Time elapsed: 67s. ETA: 1s.Processed 357540 groups out of 357540. 100% done. Time elapsed: 68s. ETA: 0s.

Cross-validation

Using best lambda model

Use terms from global model to re-fit and predict on different held out regions

Figures show residuals of the predictions made for each held-out ecoregion by a model fit to the remaining ecoregions

These models were fit to all level II ecoregions except one, and then used to predict on the indicated held-out ecoregion

# Leave-one-ecoregion-out cross validation for the best lambda specification.
# For each level II ecoregion, a Gamma (log link) glm with the terms of the
# global model is refit without that ecoregion and used to predict it; the
# global model's predictions for the same rows are stored for comparison.
# Results go into `outList`, one element per held-out ecoregion (sorted by
# name), each holding testRegion, modelObject, modelPredictions and
# performanceMetrics.
if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {

  ## code from Tredennick et al. 2020
  # try each separate level II ecoregion as a test set
  # (as.character() so each loop value is a plain string even if NA_L2NAME is a factor)
  holdout_regions <- as.character(sort(unique(modDat_1_s$NA_L2NAME)))

  # make a list to hold output data; sized from the same data frame the loop
  # runs over, so there is exactly one slot per held-out ecoregion
  outList <- vector(mode = "list", length = length(holdout_regions))

  ## get the model specification from the global model
  # NOTE(review): `s = "lambda.min"` is only meaningful for a glmnet-type fit;
  # the object is later passed to predict.glm(), so it is presumably a glm and
  # the argument is ignored -- confirm.
  mat <- as.matrix(coef(fit_glm_bestLambda, s = "lambda.min"))
  mat2 <- mat[mat[, 1] != 0, ]

  # every non-zero term except the first one (assumed to be the intercept)
  f_cv <- as.formula(paste0(response, " ~ ", paste0(names(mat2)[-1], collapse = " + ")))

  # now, loop through so with each iteration, a different ecoregion is held out
  for (i in seq_along(holdout_regions)) {

    test_eco <- holdout_regions[i]
    print(test_eco)

    # row indices of the training (all other ecoregions) and test (held-out) data
    train <- which(modDat_1_s$NA_L2NAME != test_eco)
    test <- which(modDat_1_s$NA_L2NAME == test_eco)

    trainDat_all <- modDat_1_s %>%
      slice(train) %>%
      select(-newRegion)
    testDat_all <- modDat_1_s %>%
      slice(test) %>%
      select(-newRegion)

    # observed response in the held-out ecoregion
    y_test <- modDat_1_s[[response]][test]

    ## just try a regular glm w/ the components from the global model
    fit_i <- glm(formula = f_cv, data = trainDat_all,
                 family = stats::Gamma(link = "log"))

    # predictions for the held-out ecoregion from the refit (cross validation) model
    optimal_pred <- predict(fit_i, newdata = testDat_all, type = "response")

    # the "null" model in this case is the global model:
    # predict on the test data for this iteration w/ the global model
    null_pred <- predict.glm(fit_glm_bestLambda, newdata = testDat_all, type = "response")

    # observed and predicted values alongside the full test data
    tmp <- data.frame(ecoRegion_holdout = rep(test_eco, length(y_test)),
                      obs = y_test,
                      pred_opt = optimal_pred,
                      pred_null = null_pred) %>%
      cbind(testDat_all)

    # RMSE of the CV model and of the global model (rmse_vec drops NAs by default)
    RMSE_optimal <- yardstick::rmse_vec(truth = y_test, estimate = optimal_pred)
    RMSE_null <- yardstick::rmse_vec(truth = y_test, estimate = null_pred)

    # bias = mean(obs - pred); na.rm = TRUE to match the NA handling of the RMSE
    bias_optimal <- mean(y_test - optimal_pred, na.rm = TRUE)
    bias_null <- mean(y_test - null_pred, na.rm = TRUE)

    # save model outputs
    outList[[i]] <- list("testRegion" = test_eco,
                         "modelObject" = fit_i,
                         "modelPredictions" = tmp,
                         "performanceMetrics" = data.frame("RMSE_cvModel" = RMSE_optimal,
                                                           "RMSE_globalModel" = RMSE_null,
                                                           "bias_cvModel" = bias_optimal,
                                                           "bias_globalModel" = bias_null))
  }
}
## [1] "ATLANTIC HIGHLANDS"
## [1] "CENTRAL USA PLAINS"
## [1] "EVERGLADES"
## [1] "MARINE WEST COAST FOREST"
## [1] "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"
## [1] "MIXED WOOD PLAINS"
## [1] "MIXED WOOD SHIELD"
## [1] "OZARK/OUACHITA-APPALACHIAN FORESTS"
## [1] "SOUTHEASTERN USA PLAINS"
## [1] "UPPER GILA MOUNTAINS"
## [1] "WESTERN CORDILLERA"

Below are the RMSE and bias values for predictions made for each holdout level II ecoregion, compared to predictions from the global model for that same ecoregion

# table of model performance
# Guarded the same way as the cross validation itself: when the model kept no
# predictors, `outList` was never filled for this model and there is nothing
# to tabulate.
if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {
  # one row per held-out ecoregion: region name followed by its metrics
  map(outList, .f = function(x) {
    cbind(data.frame("holdout region" = x$testRegion), x$performanceMetrics)
  }) %>%
    purrr::list_rbind() %>%
    kable(col.names = c("Held-out ecoregion", "RMSE of CV model", "RMSE of global model",
                        "bias of CV model - mean(obs-pred.)", "bias of global model- mean(obs-pred.)"),
          caption = "Performance of Cross Validation using 'best lambda' model specification") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
}
Performance of Cross Validation using ‘best lambda’ model specification
Held-out ecoregion RMSE of CV model RMSE of global model bias of CV model - mean(obs-pred.) bias of global model- mean(obs-pred.)
ATLANTIC HIGHLANDS 0.6784331 0.6087695 -0.1659645 -0.0538055
CENTRAL USA PLAINS 1.2970481 1.2937379 0.0890837 0.0786248
EVERGLADES 3.2323809 3.0138809 1.4248381 0.9402743
MARINE WEST COAST FOREST 9.2052700 8.5973029 -2.2182168 -1.0441827
MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS 2.0100066 1.9590364 0.1051420 -0.0755078
MIXED WOOD PLAINS 0.9651751 0.9539728 -0.0953374 -0.0591810
MIXED WOOD SHIELD 0.9442755 0.9286658 -0.0130296 -0.0509354
OZARK/OUACHITA-APPALACHIAN FORESTS 1.0343639 1.0210701 0.0940716 0.0085500
SOUTHEASTERN USA PLAINS 1.5033915 1.4797245 -0.1144934 -0.0242760
UPPER GILA MOUNTAINS 13.1162655 10.8147292 -6.3211915 -1.1433401
WESTERN CORDILLERA 9.9056024 8.6445173 4.1162342 0.2644402
# visualize model predictions
# For every held-out ecoregion stored in `outList`: rasterize the residuals of
# the cross validation model onto `test_rast`, mark residuals beyond +/-100,
# and stack a residual histogram under the map.
if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {
  # figures are collected in a list keyed by the held-out ecoregion name
  residPlots <- list()

  for (i in seq_along(outList)) {
    # skip slots that were never filled
    if (is.null(outList[[i]])) next

    holdoutRegion <- outList[[i]]$testRegion
    predictionData <- outList[[i]]$modelPredictions

    # names of the non-zero coefficients of the refit model (shown in the subtitle)
    modCoefs <- coef(outList[[i]]$modelObject)
    modTerms <- names(modCoefs)[which(modCoefs != 0)]

    # calculate residuals (obs. - pred.) for the CV model and for the global model
    predictionData <- predictionData %>%
      mutate(resid = obs - pred_opt,
             resid_globMod = obs - pred_null)

    # rasterize
    # use 'test_rast' from earlier
    plotObs <- predictionData %>%
      drop_na(all_of(response)) %>%
      #slice_sample(n = 5e4) %>%
      terra::vect(geom = c("Long", "Lat")) %>%
      terra::set.crs(crs(test_rast)) %>%
      terra::rasterize(y = test_rast,
                       field = "resid",
                       fun = mean) #%>%
      #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
      #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

    # crop the raster to the bounding box of the cells that hold data
    tempExt <- crds(plotObs, na.rm = TRUE)
    plotObs_2 <- plotObs %>%
      crop(ext(min(tempExt[, 1]), max(tempExt[, 1]),
               min(tempExt[, 2]), max(tempExt[, 2])))

    # identify locations where residuals are >100 or < -100
    # (these fall outside the limits of the fill scale below)
    badResids_high <- predictionData %>%
      filter(resid > 100) %>%
      terra::vect(geom = c("Long", "Lat")) %>%
      terra::set.crs(crs(test_rast))
    badResids_low <- predictionData %>%
      filter(resid < -100) %>%
      terra::vect(geom = c("Long", "Lat")) %>%
      terra::set.crs(crs(test_rast))

    # make figures
    # make histogram
    # NOTE(review): the histogram shows the global-model residuals
    # (resid_globMod) while the map shows the CV-model residuals (resid) --
    # confirm this is intended.
    hist_i <- ggplot(predictionData) +
      geom_histogram(aes(resid_globMod), col = "darkgrey", fill = "lightgrey") +
      xlab(c("Residuals (obs. - pred.)"))

    # make map
    map_i <- ggplot() +
      geom_spatraster(data = plotObs_2) +
      geom_sf(data = mapRegions, fill = NA, col = "rosybrown4", lwd = .5) +
      geom_sf(data = cropped_states %>%
                st_transform(crs = st_crs(test_rast)) %>%
                st_crop(st_bbox(plotObs_2)),
              fill = NA) +
      geom_sf(data = badResids_high, col = "blue") +
      geom_sf(data = badResids_low, col = "red") +
      labs(title = paste0("Residuals (obs. - pred.) for predictions of \n", holdoutRegion, " \n from a model fit to other ecoregions"),
           subtitle = paste0(response, " ~ ", paste0(modTerms, collapse = " + "))) +
      scale_fill_gradient2(low = "red",
                           mid = "white",
                           high = "blue",
                           midpoint = 0, na.value = "lightgrey",
                           limits = c(-100, 100)) +
      xlim(st_bbox(plotObs_2)[c(1, 3)]) +
      ylim(st_bbox(plotObs_2)[c(2, 4)])

    residPlots[[holdoutRegion]] <- ggarrange(map_i, hist_i, heights = c(3, 1), ncol = 1)
  }

  # show the figures in the order the ecoregions appear in the data
  unname(residPlots[as.character(unique(modDat_1_s$NA_L2NAME))])
}
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

Using 1se lambda model

Use terms from global model to re-fit and predict on different held out regions

Figures show residuals of the predictions made for each held-out ecoregion by a model fit to the remaining ecoregions

These models were fit to all level II ecoregions except one, and then used to predict on the indicated held-out ecoregion

# Leave-one-ecoregion-out cross validation for the 1 SE lambda specification.
# For each level II ecoregion, a Gamma (log link) glm with the terms of the
# global 1 SE model is refit without that ecoregion and used to predict it;
# the global model's predictions for the same rows are stored for comparison.
# Results go into `outList`, one element per held-out ecoregion (sorted by
# name), each holding testRegion, modelObject, modelPredictions and
# performanceMetrics.
if (length(prednames_fig_1se) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {

  ## code from Tredennick et al. 2020
  # try each separate level II ecoregion as a test set
  # (as.character() so each loop value is a plain string even if NA_L2NAME is a factor)
  holdout_regions <- as.character(sort(unique(modDat_1_s$NA_L2NAME)))

  # make a list to hold output data; sized from the same data frame the loop
  # runs over, so there is exactly one slot per held-out ecoregion
  outList <- vector(mode = "list", length = length(holdout_regions))

  ## get the model specification from the global model
  # NOTE(review): `s = "lambda.min"` is only meaningful for a glmnet-type fit;
  # the object is later passed to predict.glm(), so it is presumably a glm and
  # the argument is ignored -- confirm.
  mat <- as.matrix(coef(fit_glm_se, s = "lambda.min"))
  mat2 <- mat[mat[, 1] != 0, ]

  # every non-zero term except the first one (assumed to be the intercept)
  f_cv <- as.formula(paste0(response, " ~ ", paste0(names(mat2)[-1], collapse = " + ")))

  # now, loop through so with each iteration, a different ecoregion is held out
  for (i in seq_along(holdout_regions)) {

    test_eco <- holdout_regions[i]
    print(test_eco)

    # row indices of the training (all other ecoregions) and test (held-out) data
    train <- which(modDat_1_s$NA_L2NAME != test_eco)
    test <- which(modDat_1_s$NA_L2NAME == test_eco)

    trainDat_all <- modDat_1_s %>%
      slice(train) %>%
      select(-newRegion)
    testDat_all <- modDat_1_s %>%
      slice(test) %>%
      select(-newRegion)

    # observed response in the held-out ecoregion
    y_test <- modDat_1_s[[response]][test]

    ## just try a regular glm w/ the components from the global model
    fit_i <- glm(formula = f_cv, data = trainDat_all,
                 family = stats::Gamma(link = "log"))

    # predictions for the held-out ecoregion from the refit (cross validation) model
    optimal_pred <- predict(fit_i, newdata = testDat_all, type = "response")

    # the "null" model in this case is the global model:
    # predict on the test data for this iteration w/ the global model
    null_pred <- predict.glm(fit_glm_se, newdata = testDat_all, type = "response")

    # observed and predicted values alongside the full test data
    tmp <- data.frame(ecoRegion_holdout = rep(test_eco, length(y_test)),
                      obs = y_test,
                      pred_opt = optimal_pred,
                      pred_null = null_pred) %>%
      cbind(testDat_all)

    # RMSE of the CV model and of the global model (rmse_vec drops NAs by default)
    RMSE_optimal <- yardstick::rmse_vec(truth = y_test, estimate = optimal_pred)
    RMSE_null <- yardstick::rmse_vec(truth = y_test, estimate = null_pred)

    # bias = mean(obs - pred); na.rm = TRUE to match the NA handling of the RMSE
    bias_optimal <- mean(y_test - optimal_pred, na.rm = TRUE)
    bias_null <- mean(y_test - null_pred, na.rm = TRUE)

    # save model outputs
    outList[[i]] <- list("testRegion" = test_eco,
                         "modelObject" = fit_i,
                         "modelPredictions" = tmp,
                         "performanceMetrics" = data.frame("RMSE_cvModel" = RMSE_optimal,
                                                           "RMSE_globalModel" = RMSE_null,
                                                           "bias_cvModel" = bias_optimal,
                                                           "bias_globalModel" = bias_null))
  }
}
## [1] "ATLANTIC HIGHLANDS"
## [1] "CENTRAL USA PLAINS"
## [1] "EVERGLADES"
## [1] "MARINE WEST COAST FOREST"
## [1] "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"
## [1] "MIXED WOOD PLAINS"
## [1] "MIXED WOOD SHIELD"
## [1] "OZARK/OUACHITA-APPALACHIAN FORESTS"
## [1] "SOUTHEASTERN USA PLAINS"
## [1] "UPPER GILA MOUNTAINS"
## [1] "WESTERN CORDILLERA"

Below are the RMSE and bias values for predictions made for each holdout level II ecoregion, compared to predictions from the global model for that same ecoregion

# Performance table for the 1 SE lambda cross validation; only built when the
# model kept at least one predictor.
if (length(prednames_fig_1se) > 0) {
  # one row per held-out ecoregion: region name followed by its metrics
  outList %>%
    map(.f = function(regionResult) {
      cbind(data.frame("holdout region" = regionResult$testRegion),
            regionResult$performanceMetrics)
    }) %>%
    purrr::list_rbind() %>%
    kable(
      col.names = c("Held-out ecoregion", "RMSE of CV model", "RMSE of global model",
                    "bias of CV model - mean(obs-pred.)", "bias of global model - mean(obs-pred.)"),
      caption = "Performance of Cross Validation using '1 SE lambda' model specification"
    ) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
} else {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
}
Performance of Cross Validation using ‘1 SE lambda’ model specification
Held-out ecoregion RMSE of CV model RMSE of global model bias of CV model - mean(obs-pred.) bias of global model - mean(obs-pred.)
ATLANTIC HIGHLANDS 2.978743e+00 2.844148 -2.924473e+00 -2.7871653
CENTRAL USA PLAINS 2.487548e+00 2.469400 -2.115177e+00 -2.0937609
EVERGLADES 3.021256e+00 3.020758 -9.145196e-01 -0.9128580
MARINE WEST COAST FOREST 9.549817e+00 9.144240 -5.500817e-01 -0.4640511
MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS 2.978871e+00 2.889121 -2.216161e+00 -2.0932973
MIXED WOOD PLAINS 2.920443e+00 2.754377 -2.754753e+00 -2.5777841
MIXED WOOD SHIELD 2.881130e+00 2.717182 -2.733544e+00 -2.5598066
OZARK/OUACHITA-APPALACHIAN FORESTS 2.999373e+00 2.658647 -2.821083e+00 -2.4551473
SOUTHEASTERN USA PLAINS 3.216337e+00 2.760984 -2.813828e+00 -2.2764121
UPPER GILA MOUNTAINS 2.552231e+01 20.958954 4.029081e+00 4.1357384
WESTERN CORDILLERA 1.108305e+07 669.704214 -7.357850e+04 -4.1332499
# Residual figures for the 1 SE lambda cross validation.
# For every held-out ecoregion stored in `outList`: rasterize the residuals of
# the cross validation model onto `test_rast`, mark residuals beyond +/-100,
# and stack a residual histogram under the map.
if (length(prednames_fig_1se) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {
  # figures are collected in a list keyed by the held-out ecoregion name
  residPlots <- list()

  for (i in seq_along(outList)) {
    # skip slots that were never filled
    if (is.null(outList[[i]])) next

    holdoutRegion <- outList[[i]]$testRegion
    predictionData <- outList[[i]]$modelPredictions

    # names of the non-zero coefficients of the refit model (shown in the subtitle)
    modCoefs <- coef(outList[[i]]$modelObject)
    modTerms <- names(modCoefs)[which(modCoefs != 0)]

    # calculate residuals (obs. - pred.) for the CV model and for the global model
    predictionData <- predictionData %>%
      mutate(resid = obs - pred_opt,
             resid_globMod = obs - pred_null)

    # rasterize
    # use 'test_rast' from earlier
    plotObs <- predictionData %>%
      drop_na(all_of(response)) %>%
      #slice_sample(n = 5e4) %>%
      terra::vect(geom = c("Long", "Lat")) %>%
      terra::set.crs(crs(test_rast)) %>%
      terra::rasterize(y = test_rast,
                       field = "resid",
                       fun = mean) #%>%
      #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
      #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

    # crop the raster to the bounding box of the cells that hold data
    tempExt <- crds(plotObs, na.rm = TRUE)
    plotObs_2 <- plotObs %>%
      crop(ext(min(tempExt[, 1]), max(tempExt[, 1]),
               min(tempExt[, 2]), max(tempExt[, 2])))

    # identify locations where residuals are >100 or < -100
    # (these fall outside the limits of the fill scale below)
    badResids_high <- predictionData %>%
      filter(resid > 100) %>%
      terra::vect(geom = c("Long", "Lat")) %>%
      terra::set.crs(crs(test_rast))
    badResids_low <- predictionData %>%
      filter(resid < -100) %>%
      terra::vect(geom = c("Long", "Lat")) %>%
      terra::set.crs(crs(test_rast))

    # make figures
    # make histogram
    # NOTE(review): the histogram shows the global-model residuals
    # (resid_globMod) while the map shows the CV-model residuals (resid) --
    # confirm this is intended.
    hist_i <- ggplot(predictionData) +
      geom_histogram(aes(resid_globMod), col = "darkgrey", fill = "lightgrey") +
      xlab(c("Residuals (obs. - pred.)"))

    # make map
    map_i <- ggplot() +
      geom_spatraster(data = plotObs_2) +
      geom_sf(data = mapRegions, fill = NA, col = "rosybrown4", lwd = .5) +
      geom_sf(data = cropped_states %>%
                st_transform(crs = st_crs(test_rast)) %>%
                st_crop(st_bbox(plotObs_2)),
              fill = NA) +
      geom_sf(data = badResids_high, col = "blue") +
      geom_sf(data = badResids_low, col = "red") +
      labs(title = paste0("Residuals (obs. - pred.) for predictions of \n", holdoutRegion, " \n from a model fit to other ecoregions"),
           subtitle = paste0(response, " ~ ", paste0(modTerms, collapse = " + "))) +
      scale_fill_gradient2(low = "red",
                           mid = "white",
                           high = "blue",
                           midpoint = 0, na.value = "lightgrey",
                           limits = c(-100, 100)) +
      xlim(st_bbox(plotObs_2)[c(1, 3)]) +
      ylim(st_bbox(plotObs_2)[c(2, 4)])

    residPlots[[holdoutRegion]] <- ggarrange(map_i, hist_i, heights = c(3, 1), ncol = 1)
  }

  # show the figures in the order the ecoregions appear in the data
  unname(residPlots[as.character(unique(modDat_1_s$NA_L2NAME))])
}
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

Save output

# # glm models
# #mods2save <- butcher::butcher(mod_glmFinal) # removes some model components so the saved object isn't huge
# 
# #mods2save$formula <- best_form
# #mods2save$pred_vars_inter <- pred_vars_inter # so have interactions
# #n <- nrow(df_sample)
# #mods2save$data_rows <- n
# 
# 
# if(!test_run) {
#   saveRDS(mods2save, 
#         paste0("./models/glm_beta_model_CONUSwide_", s, "_n", n, 
#         #sample_group, 
#         ".RDS"))
#   if (byRegion == TRUE) {
#     ## western forests
#      saveRDS(mods2save_WF, 
#         paste0("./models/glm_beta_model_WesternForests_", s, "_n", n, 
#         #sample_group, 
#         ".RDS"))
#     ## eastern forests
#      saveRDS(mods2save_EF, 
#         paste0("./models/glm_beta_model_EasternForests_", s, "_n", n, 
#         #sample_group, 
#         ".RDS"))
#      ## grass/shrub
#      saveRDS(mods2save_G, 
#         paste0("./models/glm_beta_model_GrassShrub_", s, "_n", n, 
#         #sample_group, 
#         ".RDS"))
#   }
# }
## partial dependence plots
#vip::vip(mod_glmFinal, num_features = 15)

#pdp_all_vars(mod_glmFinal, mod_vars = pred_vars, ylab = 'probability',train = df_small)

#caret::varImp(fit)

Session info

Hash of the current commit (identifies the version of the code used for this run)

# Print the hash of the currently checked-out commit so this output can be
# matched to the version of the code that produced it.
# system(..., intern = TRUE) raises an R error when the command cannot be run
# (e.g. git is not available); that error is downgraded to a warning and NA is
# reported instead, so a missing git does not stop the rest of the document.
tryCatch(
  system("git rev-parse HEAD", intern = TRUE),
  error = function(e) {
    warning(
      "Could not determine the git commit hash: ", conditionMessage(e),
      call. = FALSE
    )
    NA_character_
  }
)
## [1] "79890c55a196d40eb16ae968701c4515b44c260c"

Packages etc.

# Report the R version, platform, and attached/loaded package versions
# in effect for this run.
print(sessionInfo())
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.7.5
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] doMC_1.3.8                 iterators_1.0.14           foreach_1.5.2              ggpubr_0.6.0               factoextra_1.0.7          
##  [6] USA.state.boundaries_1.0.1 glmnet_4.1-8               Matrix_1.7-0               kableExtra_1.4.0           rsample_1.2.1             
## [11] here_1.0.1                 StepBeta_2.1.0             ggtext_0.1.2               knitr_1.49                 gridExtra_2.3             
## [16] pdp_0.8.2                  GGally_2.2.1               lubridate_1.9.4            forcats_1.0.0              stringr_1.5.1             
## [21] dplyr_1.1.4                purrr_1.0.4                readr_2.1.5                tidyr_1.3.1                tibble_3.2.1              
## [26] tidyverse_2.0.0            caret_6.0-94               lattice_0.22-6             ggplot2_3.5.1              sf_1.0-20                 
## [31] tidyterra_0.6.1            terra_1.8-21               ggspatial_1.1.9            dtplyr_1.3.1               patchwork_1.3.0           
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3   rstudioapi_0.17.1    jsonlite_1.9.1       shape_1.4.6.1        magrittr_2.0.3       modeltools_0.2-23   
##   [7] farver_2.1.2         rmarkdown_2.29       vctrs_0.6.5          rstatix_0.7.2        htmltools_0.5.8.1    broom_1.0.7         
##  [13] Formula_1.2-5        pROC_1.18.5          sass_0.4.9           parallelly_1.37.1    KernSmooth_2.23-22   bslib_0.9.0         
##  [19] plyr_1.8.9           sandwich_3.1-0       zoo_1.8-12           cachem_1.1.0         commonmark_1.9.1     lifecycle_1.0.4     
##  [25] pkgconfig_2.0.3      R6_2.6.1             fastmap_1.2.0        future_1.33.2        digest_0.6.37        colorspace_2.1-1    
##  [31] furrr_0.3.1          rprojroot_2.0.4      pkgload_1.3.4        labeling_0.4.3       yardstick_1.3.1      timechange_0.3.0    
##  [37] mgcv_1.9-1           abind_1.4-8          compiler_4.4.0       proxy_0.4-27         aod_1.3.3            withr_3.0.2         
##  [43] backports_1.5.0      carData_3.0-5        betareg_3.1-4        DBI_1.2.3            ggstats_0.9.0        ggsignif_0.6.4      
##  [49] MASS_7.3-60.2        lava_1.8.0           classInt_0.4-10      gtools_3.9.5         ModelMetrics_1.2.2.2 tools_4.4.0         
##  [55] units_0.8-5          lmtest_0.9-40        future.apply_1.11.2  nnet_7.3-19          glue_1.8.0           nlme_3.1-164        
##  [61] gridtext_0.1.5       grid_4.4.0           reshape2_1.4.4       generics_0.1.3       recipes_1.1.0        gtable_0.3.6        
##  [67] tzdb_0.4.0           class_7.3-22         data.table_1.17.0    hms_1.1.3            utf8_1.2.4           car_3.1-2           
##  [73] xml2_1.3.7           flexmix_2.3-19       markdown_1.13        ggrepel_0.9.5        pillar_1.10.1        splines_4.4.0       
##  [79] survival_3.5-8       tidyselect_1.2.1     svglite_2.1.3        stats4_4.4.0         xfun_0.51            hardhat_1.4.0       
##  [85] timeDate_4032.109    stringi_1.8.4        yaml_2.3.10          evaluate_1.0.3       codetools_0.2-20     cli_3.6.4           
##  [91] rpart_4.1.23         systemfonts_1.2.1    munsell_0.5.1        jquerylib_0.1.4      Rcpp_1.0.14          globals_0.16.3      
##  [97] gower_1.0.1          listenv_0.9.1        viridisLite_0.4.2    ipred_0.9-15         scales_1.3.0         prodlim_2024.06.25  
## [103] e1071_1.7-14         crayon_1.5.3         combinat_0.0-8       rlang_1.1.5          cowplot_1.1.3